1 /** Network Kanji Filter. (PDS Version)
2 ** -*- coding: ISO-2022-JP -*-
3 ************************************************************************
4 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
5 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
6 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
7 ** Copyright (C) 1996,1998
9 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
10 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
11 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
12 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
14 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
15 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
16 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
17 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
18 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
19 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
21 ** Everyone is permitted to do anything on this program
22 ** including copying, modifying, improving,
23 ** as long as you don't try to pretend that you wrote it.
24 ** i.e., the above copyright notice has to appear in all copies.
25 ** Binary distribution requires original version messages.
26 ** You don't have to ask before copying, redistribution or publishing.
27 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
28 ***********************************************************************/
30 /***********************************************************************
31 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
32 * http://sourceforge.jp/projects/nkf/
33 ***********************************************************************/
34 #define NKF_IDENT "$Id: nkf.c,v 1.175 2008/02/07 19:59:13 naruse Exp $"
35 #define NKF_VERSION "2.0.8"
36 #define NKF_RELEASE_DATE "2008-02-07"
38 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
39 "Copyright (C) 2002-2008 Kono, Furukawa, Naruse, mastodon"
45 /* state of output_mode and input_mode
124 NKF_ENCODING_TABLE_SIZE,
125 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
126 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
127 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
128 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
129 JIS_X_0208 = 0x1168, /* @B */
130 JIS_X_0212 = 0x1159, /* D */
131 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
132 JIS_X_0213_2 = 0x1229, /* P */
133 JIS_X_0213_1 = 0x1233, /* Q */
136 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
139 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
140 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
141 void j_oconv(nkf_char c2, nkf_char c1);
142 void s_oconv(nkf_char c2, nkf_char c1);
143 void e_oconv(nkf_char c2, nkf_char c1);
144 void w_oconv(nkf_char c2, nkf_char c1);
145 void w_oconv16(nkf_char c2, nkf_char c1);
146 void w_oconv32(nkf_char c2, nkf_char c1);
150 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
151 void (*oconv)(nkf_char c2, nkf_char c1);
152 } nkf_native_encoding;
154 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv }; /* base encodings: each initializer is a label followed by the input (iconv) and output (oconv) converter; ASCII uses the same pair as EUC-JP */
155 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv }; /* input goes through e_iconv, output through j_oconv */
156 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
157 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
158 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
159 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 }; /* covers the BE/LE/BOM variants listed in nkf_encoding_table */
160 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 }; /* covers the BE/LE/BOM variants listed in nkf_encoding_table */
165 const nkf_native_encoding *base_encoding;
168 nkf_encoding nkf_encoding_table[] = {
169 {ASCII, "US-ASCII", &NkfEncodingASCII},
170 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
171 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
172 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
173 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
174 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
177 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
178 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
179 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
180 {CP10001, "CP10001", &NkfEncodingShift_JIS},
181 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
182 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
183 {CP51932, "CP51932", &NkfEncodingEUC_JP},
184 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
185 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
186 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
187 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
188 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
189 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
190 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
191 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
192 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
193 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
194 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
195 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
196 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
197 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
198 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
199 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
200 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
201 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
202 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
203 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
204 {BINARY, "BINARY", &NkfEncodingASCII},
211 } encoding_name_to_id_table[] = {
214 {"ISO-2022-JP", ISO_2022_JP},
215 {"ISO2022JP-CP932", CP50220},
216 {"CP50220", CP50220},
217 {"CP50221", CP50221},
218 {"CSISO2022JP", CP50221},
219 {"CP50222", CP50222},
220 {"ISO-2022-JP-1", ISO_2022_JP_1},
221 {"ISO-2022-JP-3", ISO_2022_JP_3},
222 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
223 {"SHIFT_JIS", SHIFT_JIS},
225 {"WINDOWS-31J", WINDOWS_31J},
226 {"CSWINDOWS31J", WINDOWS_31J},
227 {"CP932", WINDOWS_31J},
228 {"MS932", WINDOWS_31J},
229 {"CP10001", CP10001},
232 {"EUCJP-NKF", EUCJP_NKF},
233 {"CP51932", CP51932},
234 {"EUC-JP-MS", EUCJP_MS},
235 {"EUCJP-MS", EUCJP_MS},
236 {"EUCJPMS", EUCJP_MS},
237 {"EUC-JP-ASCII", EUCJP_ASCII},
238 {"EUCJP-ASCII", EUCJP_ASCII},
239 {"SHIFT_JISX0213", SHIFT_JISX0213},
240 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
241 {"EUC-JISX0213", EUC_JISX0213},
242 {"EUC-JIS-2004", EUC_JIS_2004},
245 {"UTF-8-BOM", UTF_8_BOM},
246 {"UTF8-MAC", UTF8_MAC},
247 {"UTF-8-MAC", UTF8_MAC},
249 {"UTF-16BE", UTF_16BE},
250 {"UTF-16BE-BOM", UTF_16BE_BOM},
251 {"UTF-16LE", UTF_16LE},
252 {"UTF-16LE-BOM", UTF_16LE_BOM},
254 {"UTF-32BE", UTF_32BE},
255 {"UTF-32BE-BOM", UTF_32BE_BOM},
256 {"UTF-32LE", UTF_32LE},
257 {"UTF-32LE-BOM", UTF_32LE_BOM},
262 #if defined(DEFAULT_CODE_JIS)
263 #define DEFAULT_ENCIDX ISO_2022_JP
264 #elif defined(DEFAULT_CODE_SJIS)
265 #define DEFAULT_ENCIDX SHIFT_JIS
266 #elif defined(DEFAULT_CODE_EUC)
267 #define DEFAULT_ENCIDX EUC_JP
268 #elif defined(DEFAULT_CODE_UTF8)
269 #define DEFAULT_ENCIDX UTF_8
273 #define is_alnum(c) \
274 (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))
276 /* I don't trust portability of toupper. The macros below are ASCII-only range tests; they evaluate their argument more than once, so never pass an expression with side effects. */
277 #define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
278 #define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
279 #define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
280 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
281 #define nkf_isblank(c) ((c) == SP || (c) == TAB)
282 #define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
283 #define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
284 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
285 #define nkf_isprint(c) (SP<=(c) && (c)<='~')
286 #define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
287 #define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
288 ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
289 ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
290 #define bin2hex(c) ("0123456789ABCDEF"[(c)&15]) /* only the low four bits of c select the digit */
291 #define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3) /* the cast now applies to the whole argument, not just its first operand */
292 #define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
293 (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
294 && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
296 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
298 #define HOLD_SIZE 1024
299 #if defined(INT_IS_SHORT)
300 #define IOBUF_SIZE 2048
302 #define IOBUF_SIZE 16384
305 #define DEFAULT_J 'B'
306 #define DEFAULT_R 'B'
313 /* MIME preprocessor */
315 #ifdef EASYWIN /*Easy Win */
316 extern POINT _BufferSize;
325 void (*status_func)(struct input_code *, nkf_char);
326 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
330 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
331 static nkf_encoding *input_encoding = NULL;
332 static nkf_encoding *output_encoding = NULL;
334 static int kanji_convert(FILE *f);
335 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
337 * 0: Shift_JIS, eucJP-ascii
342 #define UCS_MAP_ASCII 0
344 #define UCS_MAP_CP932 2
345 #define UCS_MAP_CP10001 3
346 static int ms_ucs_map_f = UCS_MAP_ASCII;
348 #ifdef UTF8_INPUT_ENABLE
349 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
350 static int no_cp932ext_f = FALSE;
351 /* ignore ZERO WIDTH NO-BREAK SPACE */
352 static int no_best_fit_chars_f = FALSE;
353 static int input_endian = ENDIAN_BIG;
354 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
355 static void (*encode_fallback)(nkf_char c) = NULL;
356 static void w_status(struct input_code *, nkf_char);
358 #ifdef UTF8_OUTPUT_ENABLE
359 static int output_bom_f = FALSE;
360 static int output_endian = ENDIAN_BIG;
363 static void std_putc(nkf_char c);
364 static nkf_char std_getc(FILE *f);
365 static nkf_char std_ungetc(nkf_char c,FILE *f);
367 static nkf_char broken_getc(FILE *f);
368 static nkf_char broken_ungetc(nkf_char c,FILE *f);
370 static nkf_char mime_getc(FILE *f);
372 static void mime_putc(nkf_char c);
376 #if !defined(PERL_XS) && !defined(WIN32DLL)
377 static unsigned char stdibuf[IOBUF_SIZE];
378 static unsigned char stdobuf[IOBUF_SIZE];
382 static int unbuf_f = FALSE; /* NOTE(review): presumably the "output is unbuffered" switch from the usage text — confirm */
383 static int estab_f = FALSE;
384 static int nop_f = FALSE;
385 static int binmode_f = TRUE; /* binary mode */
386 static int rot_f = FALSE; /* rot13/47 mode */
387 static int hira_f = FALSE; /* hiragana/katakana conversion */
388 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
389 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
390 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
391 static int mimebuf_f = FALSE; /* MIME buffered input */
392 static int broken_f = FALSE; /* convert ESC-less broken JIS */
393 static int iso8859_f = FALSE; /* ISO8859 through */
394 static int mimeout_f = FALSE; /* base64 mode */
395 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
396 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
398 #ifdef UNICODE_NORMALIZATION
399 static int nfc_f = FALSE;
400 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
401 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
405 static int cap_f = FALSE;
406 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
407 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
409 static int url_f = FALSE;
410 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
411 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
414 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00) /* bits OR-ed into a code by nkf_char_euc3_new() */
415 #define CLASS_MASK NKF_INT32_C(0xFF000000) /* top byte: class tag tested by nkf_char_unicode_p() */
416 #define CLASS_UNICODE NKF_INT32_C(0x01000000) /* class tag set by nkf_char_unicode_new() */
417 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF) /* low 24 bits: the value carried under the class tag */
418 #define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
419 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
420 #define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
421 #define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
422 #define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
423 #define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX) /* UNICODE_BMP_MAX is already an NKF_INT32_C constant; it must not be wrapped a second time */
424 #define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX) /* likewise compares against UNICODE_MAX directly */
426 #ifdef NUMCHAR_OPTION
427 static int numchar_f = FALSE;
428 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
429 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
433 static int noout_f = FALSE;
434 static void no_putc(nkf_char c);
435 static int debug_f = FALSE;
436 static void debug(const char *str);
437 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
440 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
441 static void set_input_codename(char *codename);
444 static int exec_f = 0;
447 #ifdef SHIFTJIS_CP932
448 /* invert IBM extended characters to others */
449 static int cp51932_f = FALSE;
451 /* invert NEC-selected IBM extended characters to IBM extended characters */
452 static int cp932inv_f = TRUE;
454 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
455 #endif /* SHIFTJIS_CP932 */
457 static int x0212_f = FALSE;
458 static int x0213_f = FALSE;
460 static unsigned char prefix_table[256];
462 static void e_status(struct input_code *, nkf_char);
463 static void s_status(struct input_code *, nkf_char);
465 struct input_code input_code_list[] = {
466 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
467 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
468 #ifdef UTF8_INPUT_ENABLE
469 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
474 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
475 static int base64_count = 0;
477 /* X0208 -> ASCII converter */
480 static int f_line = 0; /* chars in line */
481 static int f_prev = 0;
482 static int fold_preserve_f = FALSE; /* preserve new lines */
483 static int fold_f = FALSE;
484 static int fold_len = 0;
487 static unsigned char kanji_intro = DEFAULT_J;
488 static unsigned char ascii_intro = DEFAULT_R;
492 #define FOLD_MARGIN 10
493 #define DEFAULT_FOLD 60
495 static int fold_margin = FOLD_MARGIN;
497 /* process default */
499 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
501 fprintf(stderr,"nkf internal module connection failure.\n");
506 void no_connection(nkf_char c2, nkf_char c1)
508 no_connection2(c2,c1,0);
511 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
512 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
514 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
515 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
516 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
517 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
518 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
519 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
520 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
522 /* static redirections */
524 static void (*o_putc)(nkf_char c) = std_putc;
526 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
527 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
529 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
530 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
532 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
534 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
535 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
537 /* for strict mime */
538 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
539 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
542 static int output_mode = ASCII; /* output kanji mode */
543 static int input_mode = ASCII; /* input kanji mode */
544 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
546 /* X0201 / X0208 conversion tables */
548 /* X0201 kana conversion table */
550 static const unsigned char cv[]= {
551 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
552 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
553 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
554 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
555 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
556 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
557 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
558 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
559 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
560 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
561 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
562 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
563 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
564 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
565 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
566 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
570 /* X0201 kana conversion table for dakuten */
572 static const unsigned char dv[]= {
573 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
574 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
575 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
577 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
578 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
579 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
580 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
581 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
582 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
583 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
584 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
585 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
586 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 /* X0201 kana conversion table for han-dakuten */
593 static const unsigned char ev[]= {
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
605 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 /* X0208 kigou conversion table */
614 /* 0x8140 - 0x819e */
615 static const unsigned char fv[] = {
617 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
618 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
619 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
620 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
621 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
622 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
623 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
624 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
625 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
626 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
627 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
628 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
633 static int option_mode = 0;
634 static int file_out_f = FALSE;
636 static int overwrite_f = FALSE;
637 static int preserve_time_f = FALSE;
638 static int backup_f = FALSE;
639 static char *backup_suffix = "";
642 static int eolmode_f = 0; /* CR, LF, CRLF */
643 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
644 static nkf_char prev_cr = 0; /* CR or 0 */
645 #ifdef EASYWIN /*Easy Win */
646 static int end_check;
649 #define STD_GC_BUFSIZE (256)
650 nkf_char std_gc_buf[STD_GC_BUFSIZE];
653 char* nkf_strcpy(const char *str)
655 char* result = malloc(strlen(str) + 1);
664 static void nkf_str_upcase(const char *src, char *dest, size_t length)
667 for (; i < length && src[i]; i++) {
668 dest[i] = nkf_toupper(src[i]);
673 static nkf_encoding *nkf_enc_from_index(int idx)
675 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
678 return &nkf_encoding_table[idx];
681 static int nkf_enc_find_index(const char *name)
684 if (*name == 'X' && *(name+1) == '-') name += 2;
685 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
686 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
687 return encoding_name_to_id_table[i].id;
693 static nkf_encoding *nkf_enc_find(const char *name)
696 idx = nkf_enc_find_index(name);
697 if (idx < 0) return 0;
698 return nkf_enc_from_index(idx);
701 #define nkf_enc_name(enc) (enc)->name /* the name member of an nkf_encoding entry */
702 #define nkf_enc_to_index(enc) (enc)->id /* the id member, i.e. the entry's encoding id such as CP50220 */
703 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding /* pointer to the nkf_native_encoding the entry is built on */
704 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv /* input converter of the base encoding */
705 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv /* output converter of the base encoding */
706 #define nkf_enc_asciicompat(enc) (\
707 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
708 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP) /* true when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP */
709 #define nkf_enc_unicode_p(enc) (\
710 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
711 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
712 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32) /* true when the base encoding is one of the three UTF base encodings */
713 #define nkf_enc_cp5022x_p(enc) (\
714 nkf_enc_to_index(enc) == CP50220 ||\
715 nkf_enc_to_index(enc) == CP50221 ||\
716 nkf_enc_to_index(enc) == CP50222) /* true for exactly the ids CP50220, CP50221 and CP50222; tests the id, not the base encoding */
718 #ifdef DEFAULT_CODE_LOCALE
719 static char* nkf_locale_charmap()
721 #ifdef HAVE_LANGINFO_H
722 return nl_langinfo(CODESET);
723 #elif defined(__WIN32__)
724 { static char codepage_name[16]; sprintf(codepage_name, "CP%d", (int)GetACP()); return codepage_name; } /* format into a static buffer that outlives the call; "CP" plus the digits of an int fits in 16 bytes */
730 static nkf_encoding* nkf_locale_encoding()
732 nkf_encoding *enc = 0;
733 char *encname = nkf_locale_charmap();
735 enc = nkf_enc_find(encname);
736 if (enc < 0) enc = 0;
739 #endif /* DEFAULT_CODE_LOCALE */
741 static nkf_encoding* nkf_default_encoding()
743 nkf_encoding *enc = 0;
744 #ifdef DEFAULT_CODE_LOCALE
745 enc = nkf_locale_encoding();
747 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
754 #define fprintf dllprintf
759 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
765 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
767 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
768 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
769 #ifdef UTF8_OUTPUT_ENABLE
770 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
772 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
773 #ifdef UTF8_INPUT_ENABLE
774 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
777 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
778 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
779 "r {de/en}crypt ROT13/47\n"
780 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
781 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
782 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
783 "l ISO8859-1 (Latin-1) support\n"
784 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
785 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
786 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
787 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
788 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
789 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
791 "T Text mode output\n"
793 "O Output to File (DEFAULT 'nkf.out')\n"
794 "I Convert non ISO-2022-JP charactor to GETA\n"
795 "d,c Convert line breaks -d: LF -c: CRLF\n"
796 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
797 "v, V Show this usage. V: show configuration\n"
799 "Long name options\n"
800 " --ic=<input codeset> --oc=<output codeset>\n"
801 " Specify the input or output codeset\n"
802 " --fj --unix --mac --windows\n"
803 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
804 " Convert for the system or code\n"
805 " --hiragana --katakana --katakana-hiragana\n"
806 " To Hiragana/Katakana Conversion\n"
807 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
809 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
811 #ifdef NUMCHAR_OPTION
812 " --numchar-input Convert Unicode Character Reference\n"
814 #ifdef UTF8_INPUT_ENABLE
815 " --fb-{skip, html, xml, perl, java, subchar}\n"
816 " Specify how nkf handles unassigned characters\n"
819 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
820 " Overwrite original listed files by filtered result\n"
821 " --overwrite preserves timestamp of original files\n"
823 " -g --guess Guess the input code\n"
824 " --help --version Show this help/the version\n"
825 " For more information, see also man nkf\n"
830 void show_configuration(void)
833 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
836 " Compile-time options:\n"
837 " Compiled at: " __DATE__ " " __TIME__ "\n"
840 " Default output encoding: "
841 #ifdef DEFAULT_CODE_LOCALE
842 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
844 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
850 " Default output end of line: "
851 #if DEFAULT_NEWLINE == CR
853 #elif DEFAULT_NEWLINE == CRLF
859 " Decode MIME encoded string: "
860 #if MIME_DECODE_DEFAULT
866 " Convert JIS X 0201 Katakana: "
873 " --help, --version output: "
874 #if HELP_OUTPUT_HELP_OUTPUT
884 char *get_backup_filename(const char *suffix, const char *filename)
886 char *backup_filename;
887 int asterisk_count = 0;
889 int filename_length = strlen(filename);
891 for(i = 0; suffix[i]; i++){
892 if(suffix[i] == '*') asterisk_count++;
896 backup_filename = malloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
897 if (!backup_filename){
898 perror("Can't malloc backup filename.");
902 for(i = 0, j = 0; suffix[i];){
903 if(suffix[i] == '*'){
904 backup_filename[j] = '\0';
905 strncat(backup_filename, filename, filename_length);
907 j += filename_length;
909 backup_filename[j++] = suffix[i++];
912 backup_filename[j] = '\0';
914 j = strlen(suffix) + filename_length; /* length of filename followed by suffix, without the NUL */
915 backup_filename = malloc(j + 1); /* j + 1 bytes: the copy below needs room for filename, suffix and the terminating NUL */
916 strcpy(backup_filename, filename);
917 strcat(backup_filename, suffix);
918 backup_filename[j] = '\0';
920 return backup_filename;
924 #ifdef UTF8_INPUT_ENABLE
925 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
932 (*f)(0, bin2hex(c>>shift));
942 void encode_fallback_html(nkf_char c)
947 if(c >= NKF_INT32_C(1000000))
948 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
949 if(c >= NKF_INT32_C(100000))
950 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
952 (*oconv)(0, 0x30+(c/10000 )%10);
954 (*oconv)(0, 0x30+(c/1000 )%10);
956 (*oconv)(0, 0x30+(c/100 )%10);
958 (*oconv)(0, 0x30+(c/10 )%10);
960 (*oconv)(0, 0x30+ c %10);
965 void encode_fallback_xml(nkf_char c)
970 nkf_each_char_to_hex(oconv, c);
975 void encode_fallback_java(nkf_char c)
979 if(!nkf_char_unicode_bmp_p(c)){
983 (*oconv)(0, bin2hex(c>>20));
984 (*oconv)(0, bin2hex(c>>16));
988 (*oconv)(0, bin2hex(c>>12));
989 (*oconv)(0, bin2hex(c>> 8));
990 (*oconv)(0, bin2hex(c>> 4));
991 (*oconv)(0, bin2hex(c ));
995 void encode_fallback_perl(nkf_char c)
1000 nkf_each_char_to_hex(oconv, c);
1005 void encode_fallback_subchar(nkf_char c)
1007 c = unicode_subchar;
1008 (*oconv)((c>>8)&0xFF, c&0xFF);
1013 static const struct {
1037 {"katakana-hiragana","h3"},
1045 #ifdef UTF8_OUTPUT_ENABLE
1055 {"fb-subchar=", ""},
1057 #ifdef UTF8_INPUT_ENABLE
1058 {"utf8-input", "W"},
1059 {"utf16-input", "W16"},
1060 {"no-cp932ext", ""},
1061 {"no-best-fit-chars",""},
1063 #ifdef UNICODE_NORMALIZATION
1064 {"utf8mac-input", ""},
1076 #ifdef NUMCHAR_OPTION
1077 {"numchar-input", ""},
1083 #ifdef SHIFTJIS_CP932
1093 static void set_input_encoding(nkf_encoding *enc)
1095 switch (nkf_enc_to_index(enc)) {
1102 #ifdef SHIFTJIS_CP932
1105 #ifdef UTF8_OUTPUT_ENABLE
1106 ms_ucs_map_f = UCS_MAP_CP932;
1116 case ISO_2022_JP_2004:
1123 #ifdef SHIFTJIS_CP932
1126 #ifdef UTF8_OUTPUT_ENABLE
1127 ms_ucs_map_f = UCS_MAP_CP932;
1132 #ifdef SHIFTJIS_CP932
1135 #ifdef UTF8_OUTPUT_ENABLE
1136 ms_ucs_map_f = UCS_MAP_CP10001;
1144 #ifdef SHIFTJIS_CP932
1147 #ifdef UTF8_OUTPUT_ENABLE
1148 ms_ucs_map_f = UCS_MAP_CP932;
1152 #ifdef SHIFTJIS_CP932
1155 #ifdef UTF8_OUTPUT_ENABLE
1156 ms_ucs_map_f = UCS_MAP_MS;
1160 #ifdef SHIFTJIS_CP932
1163 #ifdef UTF8_OUTPUT_ENABLE
1164 ms_ucs_map_f = UCS_MAP_ASCII;
1167 case SHIFT_JISX0213:
1168 case SHIFT_JIS_2004:
1170 #ifdef SHIFTJIS_CP932
1177 #ifdef SHIFTJIS_CP932
1181 #ifdef UTF8_INPUT_ENABLE
1182 #ifdef UNICODE_NORMALIZATION
1190 input_endian = ENDIAN_BIG;
1194 input_endian = ENDIAN_LITTLE;
1199 input_endian = ENDIAN_BIG;
1203 input_endian = ENDIAN_LITTLE;
1209 static void set_output_encoding(nkf_encoding *enc)
1211 switch (nkf_enc_to_index(enc)) {
1214 #ifdef SHIFTJIS_CP932
1215 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1217 #ifdef UTF8_OUTPUT_ENABLE
1218 ms_ucs_map_f = UCS_MAP_CP932;
1222 #ifdef SHIFTJIS_CP932
1223 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1225 #ifdef UTF8_OUTPUT_ENABLE
1226 ms_ucs_map_f = UCS_MAP_CP932;
1231 #ifdef SHIFTJIS_CP932
1232 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1238 #ifdef SHIFTJIS_CP932
1239 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1245 #ifdef UTF8_OUTPUT_ENABLE
1246 ms_ucs_map_f = UCS_MAP_CP932;
1250 #ifdef UTF8_OUTPUT_ENABLE
1251 ms_ucs_map_f = UCS_MAP_CP10001;
1256 #ifdef SHIFTJIS_CP932
1257 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1259 #ifdef UTF8_OUTPUT_ENABLE
1260 ms_ucs_map_f = UCS_MAP_ASCII;
1265 #ifdef SHIFTJIS_CP932
1266 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1268 #ifdef UTF8_OUTPUT_ENABLE
1269 ms_ucs_map_f = UCS_MAP_ASCII;
1273 #ifdef SHIFTJIS_CP932
1274 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1276 #ifdef UTF8_OUTPUT_ENABLE
1277 ms_ucs_map_f = UCS_MAP_CP932;
1282 #ifdef UTF8_OUTPUT_ENABLE
1283 ms_ucs_map_f = UCS_MAP_MS;
1288 #ifdef UTF8_OUTPUT_ENABLE
1289 ms_ucs_map_f = UCS_MAP_ASCII;
1292 case SHIFT_JISX0213:
1293 case SHIFT_JIS_2004:
1295 #ifdef SHIFTJIS_CP932
1296 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1303 #ifdef SHIFTJIS_CP932
1304 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1307 #ifdef UTF8_OUTPUT_ENABLE
1309 output_bom_f = TRUE;
1313 output_bom_f = TRUE;
1316 output_endian = ENDIAN_LITTLE;
1317 output_bom_f = FALSE;
1320 output_endian = ENDIAN_LITTLE;
1321 output_bom_f = TRUE;
1324 output_bom_f = TRUE;
1327 output_endian = ENDIAN_LITTLE;
1328 output_bom_f = FALSE;
1331 output_endian = ENDIAN_LITTLE;
1332 output_bom_f = TRUE;
1338 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1341 struct input_code *p = input_code_list;
1343 if (iconv_func == p->iconv_func){
1352 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1354 #ifdef INPUT_CODE_FIX
1355 if (f || !input_encoding)
1362 #ifdef INPUT_CODE_FIX
1363 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1369 if (estab_f && iconv_for_check != iconv){
1370 struct input_code *p = find_inputcode_byfunc(iconv);
1372 set_input_codename(p->name);
1375 iconv_for_check = iconv;
1381 nkf_char x0212_shift(nkf_char c)
1386 if (0x75 <= c && c <= 0x7f){
1387 ret = c + (0x109 - 0x75);
1390 if (0x75 <= c && c <= 0x7f){
1391 ret = c + (0x113 - 0x75);
1398 nkf_char x0212_unshift(nkf_char c)
1401 if (0x7f <= c && c <= 0x88){
1402 ret = c + (0x75 - 0x7f);
1403 }else if (0x89 <= c && c <= 0x92){
1404 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1408 #endif /* X0212_ENABLE */
1410 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1416 if((0x21 <= ndx && ndx <= 0x2F)){
1417 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1418 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1420 }else if(0x6E <= ndx && ndx <= 0x7E){
1421 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1422 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1428 else if(nkf_isgraph(ndx)){
1430 const unsigned short *ptr;
1431 ptr = x0212_shiftjis[ndx - 0x21];
1433 val = ptr[(c1 & 0x7f) - 0x21];
1442 c2 = x0212_shift(c2);
1444 #endif /* X0212_ENABLE */
1446 if(0x7F < c2) return 1;
1447 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1448 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1452 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1454 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1457 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1458 #ifdef SHIFTJIS_CP932
1459 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1460 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1467 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1468 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1474 #endif /* SHIFTJIS_CP932 */
1476 if (!x0213_f && is_ibmext_in_sjis(c2)){
1477 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1480 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1493 if(x0213_f && c2 >= 0xF0){
1494 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1495 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1496 }else{ /* 78<=k<=94 */
1497 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1498 if (0x9E < c1) c2++;
1501 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1502 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1503 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1504 if (0x9E < c1) c2++;
1507 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1514 c2 = x0212_unshift(c2);
1521 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * nkf_unicode_to_utf8: encode the code point `val` as UTF-8, storing the
 * bytes through p1 (lead byte) .. p4.
 * NOTE(review): only some lines of this body appear in this listing; the
 * comments below describe the visible branches only.
 */
1522 void nkf_unicode_to_utf8(nkf_char val, int *p1, int *p2, int *p3, int *p4)
/* val < 0x800: two bytes, lead 110xxxxx plus one 10xxxxxx trail byte */
1530 }else if (val < 0x800){
1531 *p1 = 0xc0 | (val >> 6);
1532 *p2 = 0x80 | (val & 0x3f);
/* value passes the BMP test: three bytes, lead 1110xxxx plus two trail bytes */
1535 } else if (nkf_char_unicode_bmp_p(val)) {
1536 *p1 = 0xe0 | (val >> 12);
1537 *p2 = 0x80 | ((val >> 6) & 0x3f);
1538 *p3 = 0x80 | ( val & 0x3f);
/* value beyond the BMP test but accepted by nkf_char_unicode_value_p: four bytes are written */
1540 } else if (nkf_char_unicode_value_p(val)) {
/*
 * NOTE(review): a four-byte UTF-8 sequence (RFC 3629) starts with
 * 11110xxx, i.e. 0xf0 | (val >> 18); the three trail bytes below already
 * carry bits 17..12, 11..6 and 5..0.  This line writes a three-byte lead
 * pattern (0xe0) taken from bit 16 upward, so the four bytes produced here
 * do not form a valid UTF-8 sequence and bits 17..16 are encoded twice.
 * Expected: *p1 = 0xf0 | (val >> 18);
 */
1541 *p1 = 0xe0 | (val >> 16);
1542 *p2 = 0x80 | ((val >> 12) & 0x3f);
1543 *p3 = 0x80 | ((val >> 6) & 0x3f);
1544 *p4 = 0x80 | ( val & 0x3f);
/*
 * nkf_utf8_to_unicode: combine up to four UTF-8 bytes (c1 = lead byte,
 * c2..c4 = following bytes) into one code point accumulated in `wc`.
 * The branch is selected from the lead byte.
 * NOTE(review): only some lines of this body appear in this listing (the
 * first branch, the final OR of each branch and the returns are not shown).
 */
1553 nkf_char nkf_utf8_to_unicode(int c1, int c2, int c3, int c4)
/*
 * NOTE(review): in UTF-8 only 0x80..0xC1 are trail/invalid lead values;
 * 0xC2 and 0xC3 are valid two-byte lead bytes (U+0080..U+00FF).  With the
 * bound at 0xC3 those two lead bytes land in this "invalid" branch instead
 * of the two-byte branch below.  Presumably the bound should be 0xC1.
 */
1560 else if (c1 <= 0xC3) {
1561 /* trail byte or invalid */
/* two-byte sequence: low five bits of the lead byte become bits 10..6 */
1564 else if (c1 <= 0xDF) {
1566 wc = (c1 & 0x1F) << 6;
/* three-byte sequence: lead supplies bits 15..12, c2 supplies bits 11..6 */
1569 else if (c1 <= 0xEF) {
1571 wc = (c1 & 0x0F) << 12;
1572 wc |= (c2 & 0x3F) << 6;
/*
 * NOTE(review): every other branch dispatches on the lead byte c1, and the
 * body below decodes c1 as the lead byte, but this condition tests c2.
 * A trail byte is never above 0xBF, so the test cannot reject lead bytes
 * 0xF5..0xFF.  Presumably intended: c1 <= 0xF4.
 */
1575 else if (c2 <= 0xF4) {
/* four-byte sequence: lead supplies bits 20..18, then 6 bits per following byte */
1577 wc = (c1 & 0x0F) << 18;
1578 wc |= (c2 & 0x3F) << 12;
1579 wc |= (c3 & 0x3F) << 6;
1589 #ifdef UTF8_INPUT_ENABLE
1590 static int unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1591 const unsigned short *const *pp, nkf_char psize,
1592 nkf_char *p2, nkf_char *p1)
1595 const unsigned short *p;
1598 if (pp == 0) return 1;
1601 if (c1 < 0 || psize <= c1) return 1;
1603 if (p == 0) return 1;
1606 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1608 if (val == 0) return 1;
1609 if (no_cp932ext_f && (
1610 (val>>8) == 0x2D || /* NEC special characters */
1611 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1619 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * unicode_to_jis_common: look up the JIS code for a UTF-8 sequence given as
 * its bytes (c2 = lead byte, then c1, c0) and store the result through
 * p2/p1 via unicode_to_jis_common2().  The lookup table set is chosen from
 * ms_ucs_map_f.  When no_best_fit_chars_f is set, selected sequences are
 * rejected early with return value 1.
 * NOTE(review): only some lines of this body appear in this listing; the
 * comments describe the visible lines only.
 */
1626 static nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1628 const unsigned short *const *pp;
1629 const unsigned short *const *const *ppp;
/*
 * Rejection tables, each 64 entries, indexed below with (c1 & 0x3F), the
 * low six bits of the second byte.  In the visible uses a nonzero entry
 * makes the function return 1.
 */
1630 static const char no_best_fit_chars_table_C2[] =
1631 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1632 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1633 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1634 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1635 static const char no_best_fit_chars_table_C2_ms[] =
1636 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1637 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1638 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1639 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1640 static const char no_best_fit_chars_table_932_C2[] =
1641 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1642 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1643 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1644 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1645 static const char no_best_fit_chars_table_932_C3[] =
1646 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1647 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1648 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1649 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* lead byte below 0xE0: two-byte sequence (c2, c1) */
1655 }else if(c2 < 0xe0){
1656 if(no_best_fit_chars_f){
1657 if(ms_ucs_map_f == UCS_MAP_CP932){
1660 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1663 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1666 }else if(!cp932inv_f){
1669 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1672 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1675 }else if(ms_ucs_map_f == UCS_MAP_MS){
1676 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1677 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
/* choose the two-byte table set for the active mapping, then look up (c2, c1) */
1695 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1696 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1697 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1699 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/*
 * NOTE(review): the previous branch dispatches on the lead byte (c2 < 0xe0)
 * and this branch later indexes ppp[c2 - 0xE0], so the guard here looks
 * like it should also test the lead byte: c2 < 0xF0.  As written it tests
 * the last byte c0, which does not bound c2; a lead byte of 0xF0 or above
 * would index past a 16-entry table.  The visible callers restrict the lead
 * byte before calling, so this appears latent - confirm and fix.
 */
1700 }else if(c0 < 0xF0){
1701 if(no_best_fit_chars_f){
1702 if(ms_ucs_map_f == UCS_MAP_CP932){
1703 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1704 }else if(ms_ucs_map_f == UCS_MAP_MS){
1709 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1712 if(c0 == 0x92) return 1;
1717 if(c1 == 0x80 || c0 == 0x9C) return 1;
1720 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1725 if(c0 == 0x94) return 1;
1728 if(c0 == 0xBB) return 1;
1738 if(c0 == 0x95) return 1;
1741 if(c0 == 0xA5) return 1;
1748 if(c0 == 0x8D) return 1;
1751 if(c0 == 0x9E && !cp932inv_f) return 1;
1754 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
/* choose the three-byte table set; the first level is indexed by (lead byte - 0xE0) */
1762 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1763 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1764 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1766 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
/*
 * With SHIFTJIS_CP932: after a successful lookup that produced a G3 code
 * while cp932inv_f is off, convert to Shift_JIS and back, overwriting
 * *p2/*p1 when the first conversion succeeds.
 */
1768 #ifdef SHIFTJIS_CP932
1769 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1771 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1772 s2e_conv(s2, s1, p2, p1);
1781 #ifdef UTF8_OUTPUT_ENABLE
1782 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
1784 const unsigned short *p;
1786 if (c2 == JIS_X_0201_1976_K) {
1787 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1795 p = euc_to_utf8_1byte;
1797 } else if (is_eucg3(c2)){
1798 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1801 c2 = (c2&0x7f) - 0x21;
1802 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1803 p = x0212_to_utf8_2bytes[c2];
1809 c2 = (c2&0x7f) - 0x21;
1810 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1812 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1813 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1814 euc_to_utf8_2bytes_ms[c2];
1819 c1 = (c1 & 0x7f) - 0x21;
1820 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
1826 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1833 }else if (0xc0 <= c2 && c2 <= 0xef) {
1834 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1835 #ifdef NUMCHAR_OPTION
1838 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1846 #ifdef UTF8_INPUT_ENABLE
1847 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1856 else if (nkf_char_unicode_bmp_p(val)){
1857 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1858 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1861 *p1 = nkf_char_unicode_new(val);
1867 *p1 = nkf_char_unicode_new(val);
1873 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1875 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1876 if (iso2022jp_f && !x0201_f) {
1877 c2 = GETA1; c1 = GETA2;
1879 c2 = JIS_X_0201_1976_K;
1883 }else if (c2 == 0x8f){
1887 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
1888 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1889 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
1892 c2 = (c2 << 8) | (c1 & 0x7f);
1894 #ifdef SHIFTJIS_CP932
1897 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1898 s2e_conv(s2, s1, &c2, &c1);
1905 #endif /* SHIFTJIS_CP932 */
1907 #endif /* X0212_ENABLE */
1908 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
1911 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
1912 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
1913 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
1918 #ifdef SHIFTJIS_CP932
1919 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
1921 if (e2s_conv(c2, c1, &s2, &s1) == 0){
1922 s2e_conv(s2, s1, &c2, &c1);
1929 #endif /* SHIFTJIS_CP932 */
1936 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1938 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
1939 if (iso2022jp_f && !x0201_f) {
1940 c2 = GETA1; c1 = GETA2;
1944 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
1946 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
1948 if(c1 == 0x7F) return 0;
1949 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
1952 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
1953 if (ret) return ret;
1959 nkf_char w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
1961 nkf_char ret = 0, c4 = 0;
1962 static const char w_iconv_utf8_1st_byte[] =
1964 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1965 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
1966 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
1967 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
1974 if (c1 < 0 || 0xff < c1) {
1975 }else if (c1 == 0) { /* 0 : 1 byte*/
1977 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
1980 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
1982 if (c2 < 0x80 || 0xBF < c2) return 0;
1985 if (c3 == 0) return -1;
1986 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
1991 if (c3 == 0) return -1;
1992 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
1996 if (c3 == 0) return -1;
1997 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2001 if (c3 == 0) return -2;
2002 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2006 if (c3 == 0) return -2;
2007 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2011 if (c3 == 0) return -2;
2012 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2020 if (c1 == 0 || c1 == EOF){
2021 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2022 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2025 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2033 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * unicode_iconv: classify one decoded code point `wc` and convert it.
 * Surrogate values are rejected; values below 0xFFFF go through
 * w16e_conv(); larger values below 0x10FFFF are wrapped with
 * nkf_char_unicode_new().
 * NOTE(review): the return type is size_t, yet the function returns
 * NKF_ICONV_INVALID_CODE_RANGE (defined as -13 just above) and the result of
 * w16e_conv(); a negative status is converted to a large unsigned value, so
 * callers must compare against the same converted constants - confirm.
 * NOTE(review): only some lines of this body appear in this listing.
 */
2034 static size_t unicode_iconv(nkf_char wc)
/* 27 << 11 == 0xD800, so this matches exactly 0xD800..0xDFFF */
2042 }else if ((wc>>11) == 27) {
2043 /* unpaired surrogate */
2044 return NKF_ICONV_INVALID_CODE_RANGE;
/* NOTE(review): strict '<' sends U+FFFF itself to the next branch, not to w16e_conv */
2045 }else if (wc < 0xFFFF) {
2046 ret = w16e_conv(wc, &c2, &c1);
2047 if (ret) return ret;
/* NOTE(review): strict '<' excludes U+10FFFF, the last valid code point - confirm that is intended */
2048 }else if (wc < 0x10FFFF) {
2050 c1 = nkf_char_unicode_new(wc);
2052 return NKF_ICONV_INVALID_CODE_RANGE;
2058 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2059 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
2060 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * nkf_iconv_utf_16: assemble a code point from up to four UTF-16 bytes
 * (c1..c4 in stream order) according to input_endian and pass it to
 * unicode_iconv().
 * A high-surrogate first unit (high byte 0xD8..0xDB) is combined with a
 * low-surrogate second unit (high byte 0xDC..0xDF) through UTF16_TO_UTF32;
 * if the second unit is not a low surrogate the function returns
 * NKF_ICONV_NEED_TWO_MORE_BYTES.
 * NOTE(review): status codes are negative macros returned through size_t.
 * NOTE(review): only some lines of this body appear in this listing.
 */
2061 size_t nkf_iconv_utf_16(int c1, int c2, int c3, int c4)
/* big-endian: c1/c3 are the high bytes of the two 16-bit units */
2070 if (input_endian == ENDIAN_BIG) {
2071 if (0xD8 <= c1 && c1 <= 0xDB) {
2072 if (0xDC <= c3 && c3 <= 0xDF) {
2073 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2074 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* other byte order: c2/c4 are the high bytes of the two 16-bit units */
2079 if (0xD8 <= c2 && c2 <= 0xDB) {
2080 if (0xDC <= c4 && c4 <= 0xDF) {
2081 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2082 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2088 return (*unicode_iconv)(wc);
2091 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2096 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * nkf_iconv_utf_32: assemble a code point from four UTF-32 bytes (c1..c4 in
 * stream order) according to input_endian and pass it to unicode_iconv().
 * Each visible assignment builds a 24-bit value from three of the four
 * bytes; the remaining (most significant) byte is not used in the lines
 * shown.  An unrecognised input_endian yields NKF_ICONV_INVALID_CODE_RANGE.
 * NOTE(review): the case labels are not visible in this listing, so the
 * byte-order names in the comments below are inferred from which byte is
 * placed where - confirm against the labels.
 */
2101 size_t nkf_iconv_utf_32(int c1, int c2, int c3, int c4)
2110 switch(input_endian){
/* bytes arrive most-significant first (presumably the big-endian case) */
2112 wc = c2 << 16 | c3 << 8 | c4;
/* bytes arrive least-significant first (presumably the little-endian case) */
2115 wc = c3 << 16 | c2 << 8 | c1;
/* bytes within each 16-bit half swapped (presumably the 2143 order) */
2118 wc = c1 << 16 | c4 << 8 | c3;
/* 16-bit halves swapped (presumably the 3412 order) */
2121 wc = c4 << 16 | c1 << 8 | c2;
2124 return NKF_ICONV_INVALID_CODE_RANGE;
2127 return (*unicode_iconv)(wc);
2131 #define output_ascii_escape_sequence(mode) do { \
2132 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2135 (*o_putc)(ascii_intro); \
2136 output_mode = mode; \
2140 void output_escape_sequence(int mode)
2142 if (output_mode == mode)
2150 case JIS_X_0201_1976_K:
2158 (*o_putc)(kanji_intro);
2182 void j_oconv(nkf_char c2, nkf_char c1)
2184 #ifdef NUMCHAR_OPTION
2185 if (c2 == 0 && nkf_char_unicode_p(c1)){
2186 w16e_conv(c1, &c2, &c1);
2187 if (c2 == 0 && nkf_char_unicode_p(c1)){
2188 c2 = c1 & VALUE_MASK;
2189 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2192 c2 = 0x7F + c1 / 94;
2193 c1 = 0x21 + c1 % 94;
2195 if (encode_fallback) (*encode_fallback)(c1);
2202 output_ascii_escape_sequence(ASCII);
2205 else if (c2 == EOF) {
2206 output_ascii_escape_sequence(ASCII);
2209 else if (c2 == ISO_8859_1) {
2210 output_ascii_escape_sequence(ISO_8859_1);
2213 else if (c2 == JIS_X_0201_1976_K) {
2214 output_escape_sequence(JIS_X_0201_1976_K);
2217 } else if (is_eucg3(c2)){
2218 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2219 (*o_putc)(c2 & 0x7f);
2224 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2225 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2226 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2232 void e_oconv(nkf_char c2, nkf_char c1)
2234 if (c2 == 0 && nkf_char_unicode_p(c1)){
2235 w16e_conv(c1, &c2, &c1);
2236 if (c2 == 0 && nkf_char_unicode_p(c1)){
2237 c2 = c1 & VALUE_MASK;
2238 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2242 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2243 c1 = 0x21 + c1 % 94;
2246 (*o_putc)((c2 & 0x7f) | 0x080);
2247 (*o_putc)(c1 | 0x080);
2249 (*o_putc)((c2 & 0x7f) | 0x080);
2250 (*o_putc)(c1 | 0x080);
2254 if (encode_fallback) (*encode_fallback)(c1);
2262 } else if (c2 == 0) {
2263 output_mode = ASCII;
2265 } else if (c2 == JIS_X_0201_1976_K) {
2266 output_mode = EUC_JP;
2267 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2268 } else if (c2 == ISO_8859_1) {
2269 output_mode = ISO_8859_1;
2270 (*o_putc)(c1 | 0x080);
2272 } else if (is_eucg3(c2)){
2273 output_mode = EUC_JP;
2274 #ifdef SHIFTJIS_CP932
2277 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2278 s2e_conv(s2, s1, &c2, &c1);
2283 output_mode = ASCII;
2285 }else if (is_eucg3(c2)){
2288 (*o_putc)((c2 & 0x7f) | 0x080);
2289 (*o_putc)(c1 | 0x080);
2292 (*o_putc)((c2 & 0x7f) | 0x080);
2293 (*o_putc)(c1 | 0x080);
2297 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2298 set_iconv(FALSE, 0);
2299 return; /* too late to rescue this char */
2301 output_mode = EUC_JP;
2302 (*o_putc)(c2 | 0x080);
2303 (*o_putc)(c1 | 0x080);
2307 void s_oconv(nkf_char c2, nkf_char c1)
2309 #ifdef NUMCHAR_OPTION
2310 if (c2 == 0 && nkf_char_unicode_p(c1)){
2311 w16e_conv(c1, &c2, &c1);
2312 if (c2 == 0 && nkf_char_unicode_p(c1)){
2313 c2 = c1 & VALUE_MASK;
2314 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2317 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2319 c1 += 0x40 + (c1 > 0x3e);
2324 if(encode_fallback)(*encode_fallback)(c1);
2333 } else if (c2 == 0) {
2334 output_mode = ASCII;
2336 } else if (c2 == JIS_X_0201_1976_K) {
2337 output_mode = SHIFT_JIS;
2339 } else if (c2 == ISO_8859_1) {
2340 output_mode = ISO_8859_1;
2341 (*o_putc)(c1 | 0x080);
2343 } else if (is_eucg3(c2)){
2344 output_mode = SHIFT_JIS;
2345 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2351 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2352 set_iconv(FALSE, 0);
2353 return; /* too late to rescue this char */
2355 output_mode = SHIFT_JIS;
2356 e2s_conv(c2, c1, &c2, &c1);
2358 #ifdef SHIFTJIS_CP932
2360 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2361 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2367 #endif /* SHIFTJIS_CP932 */
2370 if (prefix_table[(unsigned char)c1]){
2371 (*o_putc)(prefix_table[(unsigned char)c1]);
2377 #ifdef UTF8_OUTPUT_ENABLE
2378 void w_oconv(nkf_char c2, nkf_char c1)
2384 output_bom_f = FALSE;
2395 if (c2 == 0 && nkf_char_unicode_p(c1)){
2396 val = c1 & VALUE_MASK;
2397 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2399 if (c2) (*o_putc)(c2);
2400 if (c3) (*o_putc)(c3);
2401 if (c4) (*o_putc)(c4);
2408 val = e2w_conv(c2, c1);
2410 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2412 if (c2) (*o_putc)(c2);
2413 if (c3) (*o_putc)(c3);
2414 if (c4) (*o_putc)(c4);
2419 void w_oconv16(nkf_char c2, nkf_char c1)
2422 output_bom_f = FALSE;
2423 if (output_endian == ENDIAN_LITTLE){
2437 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2438 if (nkf_char_unicode_bmp_p(c1)) {
2439 c2 = (c1 >> 8) & 0xff;
2443 if (c1 <= UNICODE_MAX) {
2444 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2445 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2446 if (output_endian == ENDIAN_LITTLE){
2447 (*o_putc)(c2 & 0xff);
2448 (*o_putc)((c2 >> 8) & 0xff);
2449 (*o_putc)(c1 & 0xff);
2450 (*o_putc)((c1 >> 8) & 0xff);
2452 (*o_putc)((c2 >> 8) & 0xff);
2453 (*o_putc)(c2 & 0xff);
2454 (*o_putc)((c1 >> 8) & 0xff);
2455 (*o_putc)(c1 & 0xff);
2461 nkf_char val = e2w_conv(c2, c1);
2462 c2 = (val >> 8) & 0xff;
2466 if (output_endian == ENDIAN_LITTLE){
2475 void w_oconv32(nkf_char c2, nkf_char c1)
2478 output_bom_f = FALSE;
2479 if (output_endian == ENDIAN_LITTLE){
2497 if (c2 == ISO_8859_1) {
2499 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2502 c1 = e2w_conv(c2, c1);
2505 if (output_endian == ENDIAN_LITTLE){
2506 (*o_putc)( c1 & 0xFF);
2507 (*o_putc)((c1 >> 8) & 0xFF);
2508 (*o_putc)((c1 >> 16) & 0xFF);
2512 (*o_putc)((c1 >> 16) & 0xFF);
2513 (*o_putc)((c1 >> 8) & 0xFF);
2514 (*o_putc)( c1 & 0xFF);
2519 #define SCORE_L2 (1) /*
\e$BBh
\e(B2
\e$B?e=`4A;z
\e(B */
2520 #define SCORE_KANA (SCORE_L2 << 1) /*
\e$B$$$o$f$kH>3Q%+%J
\e(B */
2521 #define SCORE_DEPEND (SCORE_KANA << 1) /*
\e$B5!<o0MB8J8;z
\e(B */
2522 #define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932
\e$B$K$h$kFI$_49$(
\e(B (IBM extended characters) */
2523 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2524 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /*
\e$BB8:_$7$J$$J8;z
\e(B */
2525 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME
\e$B$K$h$k;XDj
\e(B */
2526 #define SCORE_ERROR (SCORE_iMIME << 1) /*
\e$B%(%i!<
\e(B */
2528 #define SCORE_INIT (SCORE_iMIME)
2530 static const char score_table_A0[] = {
2533 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2534 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2537 static const char score_table_F0[] = {
2538 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2539 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2540 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2541 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2544 void set_code_score(struct input_code *ptr, nkf_char score)
2547 ptr->score |= score;
2551 void clr_code_score(struct input_code *ptr, nkf_char score)
2554 ptr->score &= ~score;
2558 void code_score(struct input_code *ptr)
2560 nkf_char c2 = ptr->buf[0];
2561 #ifdef UTF8_OUTPUT_ENABLE
2562 nkf_char c1 = ptr->buf[1];
2565 set_code_score(ptr, SCORE_ERROR);
2566 }else if (c2 == SS2){
2567 set_code_score(ptr, SCORE_KANA);
2568 }else if (c2 == 0x8f){
2569 set_code_score(ptr, SCORE_X0212);
2570 #ifdef UTF8_OUTPUT_ENABLE
2571 }else if (!e2w_conv(c2, c1)){
2572 set_code_score(ptr, SCORE_NO_EXIST);
2574 }else if ((c2 & 0x70) == 0x20){
2575 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2576 }else if ((c2 & 0x70) == 0x70){
2577 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2578 }else if ((c2 & 0x70) >= 0x50){
2579 set_code_score(ptr, SCORE_L2);
2583 void status_disable(struct input_code *ptr)
2588 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2591 void status_push_ch(struct input_code *ptr, nkf_char c)
2593 ptr->buf[ptr->index++] = c;
2596 void status_clear(struct input_code *ptr)
2602 void status_reset(struct input_code *ptr)
2605 ptr->score = SCORE_INIT;
2608 void status_reinit(struct input_code *ptr)
2611 ptr->_file_stat = 0;
2614 void status_check(struct input_code *ptr, nkf_char c)
2616 if (c <= DEL && estab_f){
2621 void s_status(struct input_code *ptr, nkf_char c)
2625 status_check(ptr, c);
2630 }else if (nkf_char_unicode_p(c)){
2632 }else if (0xa1 <= c && c <= 0xdf){
2633 status_push_ch(ptr, SS2);
2634 status_push_ch(ptr, c);
2637 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2639 status_push_ch(ptr, c);
2640 }else if (0xed <= c && c <= 0xee){
2642 status_push_ch(ptr, c);
2643 #ifdef SHIFTJIS_CP932
2644 }else if (is_ibmext_in_sjis(c)){
2646 status_push_ch(ptr, c);
2647 #endif /* SHIFTJIS_CP932 */
2649 }else if (0xf0 <= c && c <= 0xfc){
2651 status_push_ch(ptr, c);
2652 #endif /* X0212_ENABLE */
2654 status_disable(ptr);
2658 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2659 status_push_ch(ptr, c);
2660 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2664 status_disable(ptr);
2668 #ifdef SHIFTJIS_CP932
2669 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2670 status_push_ch(ptr, c);
2671 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2672 set_code_score(ptr, SCORE_CP932);
2677 #endif /* SHIFTJIS_CP932 */
2678 status_disable(ptr);
2681 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2682 status_push_ch(ptr, c);
2683 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2684 set_code_score(ptr, SCORE_CP932);
2687 status_disable(ptr);
2693 void e_status(struct input_code *ptr, nkf_char c)
2697 status_check(ptr, c);
2702 }else if (nkf_char_unicode_p(c)){
2704 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2706 status_push_ch(ptr, c);
2708 }else if (0x8f == c){
2710 status_push_ch(ptr, c);
2711 #endif /* X0212_ENABLE */
2713 status_disable(ptr);
2717 if (0xa1 <= c && c <= 0xfe){
2718 status_push_ch(ptr, c);
2722 status_disable(ptr);
2727 if (0xa1 <= c && c <= 0xfe){
2729 status_push_ch(ptr, c);
2731 status_disable(ptr);
2733 #endif /* X0212_ENABLE */
2737 #ifdef UTF8_INPUT_ENABLE
2738 void w_status(struct input_code *ptr, nkf_char c)
2742 status_check(ptr, c);
2747 }else if (nkf_char_unicode_p(c)){
2749 }else if (0xc0 <= c && c <= 0xdf){
2751 status_push_ch(ptr, c);
2752 }else if (0xe0 <= c && c <= 0xef){
2754 status_push_ch(ptr, c);
2755 }else if (0xf0 <= c && c <= 0xf4){
2757 status_push_ch(ptr, c);
2759 status_disable(ptr);
2764 if (0x80 <= c && c <= 0xbf){
2765 status_push_ch(ptr, c);
2766 if (ptr->index > ptr->stat){
2767 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2768 && ptr->buf[2] == 0xbf);
2769 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2770 &ptr->buf[0], &ptr->buf[1]);
2777 status_disable(ptr);
2781 if (0x80 <= c && c <= 0xbf){
2782 if (ptr->index < ptr->stat){
2783 status_push_ch(ptr, c);
2788 status_disable(ptr);
2795 void code_status(nkf_char c)
2797 int action_flag = 1;
2798 struct input_code *result = 0;
2799 struct input_code *p = input_code_list;
2801 if (!p->status_func) {
2805 if (!p->status_func)
2807 (p->status_func)(p, c);
2810 }else if(p->stat == 0){
2821 if (result && !estab_f){
2822 set_iconv(TRUE, result->iconv_func);
2823 }else if (c <= DEL){
2824 struct input_code *ptr = input_code_list;
2834 nkf_char std_getc(FILE *f)
2837 return std_gc_buf[--std_gc_ndx];
2843 nkf_char std_ungetc(nkf_char c, FILE *f)
2845 if (std_gc_ndx == STD_GC_BUFSIZE){
2848 std_gc_buf[std_gc_ndx++] = c;
2853 void std_putc(nkf_char c)
2860 static unsigned char hold_buf[HOLD_SIZE*2];
2861 static int hold_count = 0;
2862 nkf_char push_hold_buf(nkf_char c2)
2864 if (hold_count >= HOLD_SIZE*2)
2866 hold_buf[hold_count++] = (unsigned char)c2;
2867 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
2870 static int h_conv(FILE *f, int c1, int c2)
2876 /** it must NOT be in the kanji shift sequence */
2877 /** it must NOT be written in JIS7 */
2878 /** and it must be after 2 byte 8bit code */
2884 while ((c2 = (*i_getc)(f)) != EOF) {
2890 if (push_hold_buf(c2) == EOF || estab_f) {
2896 struct input_code *p = input_code_list;
2897 struct input_code *result = p;
2902 if (p->status_func && p->score < result->score) {
2907 set_iconv(TRUE, result->iconv_func);
2912 ** 1) EOF is detected, or
2913 ** 2) Code is established, or
2914 ** 3) Buffer is FULL (but last word is pushed)
2916 ** in 1) and 3) cases, we continue to use
2917 ** Kanji codes by oconv and leave estab_f unchanged.
2922 while (hold_index < hold_count){
2923 c1 = hold_buf[hold_index++];
2927 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
2928 (*iconv)(JIS_X_0201_1976_K, c1, 0);
2931 if (hold_index < hold_count){
2932 c2 = hold_buf[hold_index++];
2942 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
2945 if (hold_index < hold_count){
2946 c3 = hold_buf[hold_index++];
2947 } else if ((c3 = (*i_getc)(f)) == EOF) {
2952 if (hold_index < hold_count){
2953 c4 = hold_buf[hold_index++];
2954 } else if ((c4 = (*i_getc)(f)) == EOF) {
2959 (*iconv)(c1, c2, (c3<<8)|c4);
2964 /* 3 bytes EUC or UTF-8 */
2965 if (hold_index < hold_count){
2966 c3 = hold_buf[hold_index++];
2967 } else if ((c3 = (*i_getc)(f)) == EOF) {
2973 (*iconv)(c1, c2, c3);
2976 if (c3 == EOF) break;
2982 * Check and Ignore BOM
2984 void check_bom(FILE *f)
2987 switch(c2 = (*i_getc)(f)){
2989 if((c2 = (*i_getc)(f)) == 0x00){
2990 if((c2 = (*i_getc)(f)) == 0xFE){
2991 if((c2 = (*i_getc)(f)) == 0xFF){
2992 if(!input_encoding){
2993 set_iconv(TRUE, w_iconv32);
2995 if (iconv == w_iconv32) {
2996 input_endian = ENDIAN_BIG;
2999 (*i_ungetc)(0xFF,f);
3000 }else (*i_ungetc)(c2,f);
3001 (*i_ungetc)(0xFE,f);
3002 }else if(c2 == 0xFF){
3003 if((c2 = (*i_getc)(f)) == 0xFE){
3004 if(!input_encoding){
3005 set_iconv(TRUE, w_iconv32);
3007 if (iconv == w_iconv32) {
3008 input_endian = ENDIAN_2143;
3011 (*i_ungetc)(0xFF,f);
3012 }else (*i_ungetc)(c2,f);
3013 (*i_ungetc)(0xFF,f);
3014 }else (*i_ungetc)(c2,f);
3015 (*i_ungetc)(0x00,f);
3016 }else (*i_ungetc)(c2,f);
3017 (*i_ungetc)(0x00,f);
3020 if((c2 = (*i_getc)(f)) == 0xBB){
3021 if((c2 = (*i_getc)(f)) == 0xBF){
3022 if(!input_encoding){
3023 set_iconv(TRUE, w_iconv);
3025 if (iconv == w_iconv) {
3028 (*i_ungetc)(0xBF,f);
3029 }else (*i_ungetc)(c2,f);
3030 (*i_ungetc)(0xBB,f);
3031 }else (*i_ungetc)(c2,f);
3032 (*i_ungetc)(0xEF,f);
3035 if((c2 = (*i_getc)(f)) == 0xFF){
3036 if((c2 = (*i_getc)(f)) == 0x00){
3037 if((c2 = (*i_getc)(f)) == 0x00){
3038 if(!input_encoding){
3039 set_iconv(TRUE, w_iconv32);
3041 if (iconv == w_iconv32) {
3042 input_endian = ENDIAN_3412;
3045 (*i_ungetc)(0x00,f);
3046 }else (*i_ungetc)(c2,f);
3047 (*i_ungetc)(0x00,f);
3048 }else (*i_ungetc)(c2,f);
3049 if(!input_encoding){
3050 set_iconv(TRUE, w_iconv16);
3052 if (iconv == w_iconv16) {
3053 input_endian = ENDIAN_BIG;
3056 (*i_ungetc)(0xFF,f);
3057 }else (*i_ungetc)(c2,f);
3058 (*i_ungetc)(0xFE,f);
3061 if((c2 = (*i_getc)(f)) == 0xFE){
3062 if((c2 = (*i_getc)(f)) == 0x00){
3063 if((c2 = (*i_getc)(f)) == 0x00){
3064 if(!input_encoding){
3065 set_iconv(TRUE, w_iconv32);
3067 if (iconv == w_iconv32) {
3068 input_endian = ENDIAN_LITTLE;
3071 (*i_ungetc)(0x00,f);
3072 }else (*i_ungetc)(c2,f);
3073 (*i_ungetc)(0x00,f);
3074 }else (*i_ungetc)(c2,f);
3075 if(!input_encoding){
3076 set_iconv(TRUE, w_iconv16);
3078 if (iconv == w_iconv16) {
3079 input_endian = ENDIAN_LITTLE;
3082 (*i_ungetc)(0xFE,f);
3083 }else (*i_ungetc)(c2,f);
3084 (*i_ungetc)(0xFF,f);
3098 static void init_broken_state(void)
3100 memset(&broken_state, 0, sizeof(broken_state));
3103 static void push_broken_buf(c)
3105 broken_state.buf[broken_state.count++] = c;
3108 static nkf_char pop_broken_buf(void)
3110 return broken_state.buf[--broken_state.count];
3113 nkf_char broken_getc(FILE *f)
3117 if (broken_state.count > 0) {
3118 return pop_broken_buf();
3121 if (c=='$' && broken_state.status != ESC
3122 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3124 broken_state.status = 0;
3125 if (c1=='@'|| c1=='B') {
3126 push_broken_buf(c1);
3133 } else if (c=='(' && broken_state.status != ESC
3134 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3136 broken_state.status = 0;
3137 if (c1=='J'|| c1=='B') {
3138 push_broken_buf(c1);
3146 broken_state.status = c;
3151 nkf_char broken_ungetc(nkf_char c, FILE *f)
3153 if (broken_state.count < 2)
3158 void eol_conv(nkf_char c2, nkf_char c1)
3160 if (guess_f && input_eol != EOF) {
3161 if (c2 == 0 && c1 == LF) {
3162 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3163 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3164 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3166 else if (!input_eol) input_eol = CR;
3167 else if (input_eol != CR) input_eol = EOF;
3169 if (prev_cr || (c2 == 0 && c1 == LF)) {
3171 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3172 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3174 if (c2 == 0 && c1 == CR) prev_cr = CR;
3175 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3179 Return value of fold_conv()
3181 LF add newline and output char
3182 CR add newline and output nothing
3185 1 (or else) normal output
3187 fold state in prev (previous character)
3189 >0x80 Japanese (X0208/X0201)
3194 This fold algorithm does not preserve leading spaces in a line.
3195 This is the main difference from fmt.
3198 #define char_size(c2,c1) (c2?2:1)
3200 void fold_conv(nkf_char c2, nkf_char c1)
3203 nkf_char fold_state;
3205 if (c1== CR && !fold_preserve_f) {
3206 fold_state=0; /* ignore cr */
3207 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3209 fold_state=0; /* ignore cr */
3210 } else if (c1== BS) {
3211 if (f_line>0) f_line--;
3213 } else if (c2==EOF && f_line != 0) { /* close open last line */
3215 } else if ((c1==LF && !fold_preserve_f)
3216 || ((c1==CR||(c1==LF&&f_prev!=CR))
3217 && fold_preserve_f)) {
3219 if (fold_preserve_f) {
3223 } else if ((f_prev == c1 && !fold_preserve_f)
3224 || (f_prev == LF && fold_preserve_f)
3225 ) { /* duplicate newline */
3228 fold_state = LF; /* output two newline */
3234 if (f_prev&0x80) { /* Japanese? */
3236 fold_state = 0; /* ignore given single newline */
3237 } else if (f_prev==SP) {
3241 if (++f_line<=fold_len)
3245 fold_state = CR; /* fold and output nothing */
3249 } else if (c1=='\f') {
3252 fold_state = LF; /* output newline and clear */
3253 } else if ( (c2==0 && c1==SP)||
3254 (c2==0 && c1==TAB)||
3255 (c2=='!'&& c1=='!')) {
3256 /* X0208 kankaku or ascii space */
3258 fold_state = 0; /* remove duplicate spaces */
3261 if (++f_line<=fold_len)
3262 fold_state = SP; /* output ASCII space only */
3264 f_prev = SP; f_line = 0;
3265 fold_state = CR; /* fold and output nothing */
3269 prev0 = f_prev; /* we still need this one... , but almost done */
3271 if (c2 || c2 == JIS_X_0201_1976_K)
3272 f_prev |= 0x80; /* this is Japanese */
3273 f_line += char_size(c2,c1);
3274 if (f_line<=fold_len) { /* normal case */
3277 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3278 f_line = char_size(c2,c1);
3279 fold_state = LF; /* We can't wait, do fold now */
3280 } else if (c2 == JIS_X_0201_1976_K) {
3281 /* simple kinsoku rules return 1 means no folding */
3282 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3283 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3284 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3285 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3286 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3287 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3288 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3290 fold_state = LF;/* add one new f_line before this character */
3293 fold_state = LF;/* add one new f_line before this character */
3296 /* kinsoku point in ASCII */
3297 if ( c1==')'|| /* { [ ( */
3308 /* just after special */
3309 } else if (!is_alnum(prev0)) {
3310 f_line = char_size(c2,c1);
3312 } else if ((prev0==SP) || /* ignored new f_line */
3313 (prev0==LF)|| /* ignored new f_line */
3314 (prev0&0x80)) { /* X0208 - ASCII */
3315 f_line = char_size(c2,c1);
3316 fold_state = LF;/* add one new f_line before this character */
3318 fold_state = 1; /* default no fold in ASCII */
3322 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3323 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3324 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3325 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3326 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3327 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3328 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3329 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3330 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3331 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3332 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3333 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3334 /* default no fold in kinsoku */
3337 f_line = char_size(c2,c1);
3338 /* add one new f_line before this character */
3341 f_line = char_size(c2,c1);
3343 /* add one new f_line before this character */
3348 /* terminator process */
3349 switch(fold_state) {
3351 OCONV_NEWLINE((*o_fconv));
3357 OCONV_NEWLINE((*o_fconv));
/* Halfwidth-kana state kept between z_conv() calls: z_conv() compares
 * z_prev2 with JIS_X_0201_1976_K and uses z_prev1 to index its conversion
 * tables. Both start at 0. */
3368 nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv -- width-conversion stage of the output chain. module_connection()
 * installs it in front of o_zconv when alpha_f or x0201_f is set, and every
 * character it produces is handed to (*o_zconv).
 *
 *   c2, c1  character pair from the previous stage; c2 == 0 and
 *           c2 == JIS_X_0201_1976_K (halfwidth kana) are tested explicitly.
 *
 * State: z_prev2/z_prev1 hold a halfwidth kana whose output is deferred so
 * that a following voiced or semi-voiced sound mark can be combined with it.
 *
 * NOTE(review): this listing omits many lines of the function (braces,
 * returns, several branch bodies), so only the visible statements are
 * described here.
 */
3370 void z_conv(nkf_char c2, nkf_char c1)
3373 /* if (c2) c1 &= 0x7f; assertion */
3375 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
/* A halfwidth kana from the previous call is still pending. */
3381 if (z_prev2 == JIS_X_0201_1976_K) {
3382 if (c2 == JIS_X_0201_1976_K) {
3383 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
/* Pending kana + dakuten: emit the pair stored in the dv table. */
3385 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3387 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
/* Pending kana + handakuten: emit the pair stored in the ev table. */
3389 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
/* Emit the pending kana on its own through the cv table. */
3394 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3396 if (c2 == JIS_X_0201_1976_K) {
/* A kana that has a dv or ev entry may still be followed by a sound mark. */
3397 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3398 /* wait for dakuten or handakuten */
3403 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
/* alpha_f bit 1: JIS X 0208 row 0x23 (alphabet) and the fv-table symbols. */
3414 if (alpha_f&1 && c2 == 0x23) {
3415 /* JISX0208 Alphabet */
3417 } else if (c2 == 0x21) {
3418 /* JISX0208 Kigou (symbols) */
3423 } else if (alpha_f&4) {
3428 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
/* alpha_f bit 8: single-byte characters (c2 == 0) are replaced by a string. */
3434 if (alpha_f&8 && c2 == 0) {
/* NOTE(review): as listed, three of these replacement strings equal the
 * character itself and the one for the double quote is not a well-formed
 * literal. Presumably the HTML entity names were lost when this listing was
 * extracted -- confirm against the upstream source before building. */
3438 case '>': entity = ">"; break;
3439 case '<': entity = "<"; break;
3440 case '\"': entity = """; break;
3441 case '&': entity = "&"; break;
/* Emit the replacement string one character at a time as single bytes. */
3444 while (*entity) (*o_zconv)(0, *entity++);
3450 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3455 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3459 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3463 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3467 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3471 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3475 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3479 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3483 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3488 (*o_zconv)(JIS_X_0201_1976_K, c);
3491 } else if (c2 == 0x25) {
3492 /* JISX0208 Katakana */
/* Indexed by c1-0x20. A nonzero entry packs two bytes: the high byte is the
 * halfwidth kana and the low byte, when nonzero, is 0x5E or 0x5F, i.e. the
 * 7-bit codes tested above as dakuten (0xde&0x7f) and handakuten
 * (0xdf&0x7f). A zero entry means no halfwidth form is emitted. */
3493 static const int fullwidth_to_halfwidth[] =
3495 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3496 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3497 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3498 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3499 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3500 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3501 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3502 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3503 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3504 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3505 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3506 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3508 if (fullwidth_to_halfwidth[c1-0x20]){
/* c2 is reused as scratch for the packed table entry. */
3509 c2 = fullwidth_to_halfwidth[c1-0x20];
3510 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
/* Second byte of the entry; the guarding condition is not visible here. */
3512 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
/*
 * rot13(c): letter rotation by 13 places. In the visible arms, values up to
 * M and up to m are increased by 13 and values up to Z and up to z are
 * decreased by 13. The lower-bound tests and the final fallback arm are
 * among the lines missing from this listing.
 * NOTE(review): the argument is expanded unparenthesised and more than once,
 * so callers should pass a simple expression without side effects.
 */
3522 #define rot13(c) ( \
3524 (c <= 'M') ? (c + 13): \
3525 (c <= 'Z') ? (c - 13): \
3527 (c <= 'm') ? (c + 13): \
3528 (c <= 'z') ? (c - 13): \
/*
 * rot47(c): rotation by 47 places. In the visible arms, values up to O are
 * increased by 47 and values up to the tilde are decreased by 47. The
 * lower-bound test and the fallback arm are not visible in this listing.
 * NOTE(review): the argument is expanded unparenthesised and more than once.
 */
3532 #define rot47(c) ( \
3534 ( c <= 'O') ? (c + 47) : \
3535 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv -- rotation stage of the output chain; the result is passed on
 * through (*o_rot_conv).
 * The branch below is taken for single-byte input (c2 == 0), JIS X 0201
 * kana and ISO-8859-1; its body (presumably the rot13/rot47 application --
 * confirm) is not visible in this listing.
 */
3539 void rot_conv(nkf_char c2, nkf_char c1)
3541 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3547 (*o_rot_conv)(c2,c1);
/*
 * hira_conv -- kana conversion stage of the output chain (presumably
 * hiragana <-> katakana; the flag tests selecting the direction are among
 * the lines missing from this listing). Every path shown ends by passing
 * c2/c1 to (*o_hira_conv).
 */
3550 void hira_conv(nkf_char c2, nkf_char c1)
/* First group: c1 in 0x21..0x73 is forwarded after a change to c2 that is
 * not visible here. */
3554 if (0x20 < c1 && c1 < 0x74) {
3556 (*o_hira_conv)(c2,c1);
/* c1 == 0x74 is only handled when the output encoding is Unicode, where it
 * is replaced by code point U+3094. */
3558 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3560 c1 = nkf_char_unicode_new(0x3094);
3561 (*o_hira_conv)(c2,c1);
/* Two symbols of row 0x21 (0x33, 0x34) are handled in this direction. */
3564 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3566 (*o_hira_conv)(c2,c1);
/* Second group: the reverse cases -- U+3094 arriving as a Unicode value,
 * row 0x24 with c1 in 0x21..0x73, and row 0x21 symbols 0x35 and 0x36. */
3571 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3574 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3576 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3580 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv -- checking stage of the output chain; the character
 * is finally passed to (*o_iso2022jp_check_conv).
 * What happens to a character that matches one of the tests below is not
 * visible in this listing (presumably it is replaced -- confirm).
 */
3584 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
/* Table of RANGE_NUM_MAX {start, end} pairs; the entries themselves are not
 * visible in this listing. */
3586 #define RANGE_NUM_MAX 18
3587 static const nkf_char range[RANGE_NUM_MAX][2] = {
3608 nkf_char start, end, c;
/* c2 in 0x00..0x20 together with c1 in 0x7f..0xff. */
3610 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
/* Rows 0x29..0x2f and 0x75..0x7e. */
3614 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
/* Test c against every listed range, bounds inclusive. */
3619 for (i = 0; i < RANGE_NUM_MAX; i++) {
3620 start = range[i][0];
3623 if (c >= start && c <= end) {
3628 (*o_iso2022jp_check_conv)(c2,c1);
3632 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * Encoded-word prefixes recognised by the MIME decoder. \075 is the octal
 * escape for the equals sign, so each entry reads =?charset?method? with
 * method B or Q. mime_priority_func, mime_encode and mime_encode_method are
 * indexed in the same order. The loops over this table stop at a null
 * entry; the terminator itself is not visible in this listing.
 */
3634 static const unsigned char *mime_pattern[] = {
3635 (const unsigned char *)"\075?EUC-JP?B?",
3636 (const unsigned char *)"\075?SHIFT_JIS?B?",
3637 (const unsigned char *)"\075?ISO-8859-1?Q?",
3638 (const unsigned char *)"\075?ISO-8859-1?B?",
3639 (const unsigned char *)"\075?ISO-2022-JP?B?",
3640 (const unsigned char *)"\075?ISO-2022-JP?Q?",
/* UTF-8 prefixes exist only when UTF-8 input support is compiled in. */
3641 #if defined(UTF8_INPUT_ENABLE)
3642 (const unsigned char *)"\075?UTF-8?B?",
3643 (const unsigned char *)"\075?UTF-8?Q?",
3645 (const unsigned char *)"\075?US-ASCII?Q?",
3650 /* Marker table used to raise the priority of the matching input code:
 * one input converter per mime_pattern entry, 0 where none is forced.
 * mime_begin_strict() passes the selected entry to set_iconv(). */
3651 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3652 e_iconv, s_iconv, 0, 0, 0, 0,
3653 #if defined(UTF8_INPUT_ENABLE)
/* Output mode associated with each mime_pattern entry, in the same order.
 * open_mime() compares its mode argument with these values to pick the
 * prefix to emit. */
3659 static const nkf_char mime_encode[] = {
3660 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3661 #if defined(UTF8_INPUT_ENABLE)
/* Transfer-encoding letter (B or Q) of each mime_pattern entry, in the same
 * order; open_mime() copies the selected entry into mimeout_mode. */
3668 static const nkf_char mime_encode_method[] = {
3669 'B', 'B','Q', 'B', 'B', 'Q',
3670 #if defined(UTF8_INPUT_ENABLE)
3678 /* MIME preprocessor fifo */
/* The buffer size is a power of two so that an index can be wrapped with a
 * simple mask instead of a modulo. */
3680 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3681 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
/* Access slot n of the ring; n may be any unsigned counter value. */
3682 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
/* Members of mime_input_state (the enclosing struct lines and the top
 * member used elsewhere in this file are not visible in this listing). */
3684 unsigned char buf[MIME_BUF_SIZE];
3686 unsigned int last; /* decoded */
3687 unsigned int input; /* undecoded */
/* Input converter saved by mime_begin_strict() and restored by
 * unswitch_mime_getc(); NULL when nothing is saved. */
3689 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Size of the recovery buffer in mime_begin_strict() and the limit on the
 * number of preamble characters examined by mime_begin(). */
3691 #define MAXRECOVER 20
/* Push c back onto the front of the FIFO: top is decremented first and the
 * byte is stored at the new (masked) top position. */
3693 static void mime_input_buf_unshift(nkf_char c)
3695 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
/* ungetc replacement installed by switch_mime_getc(): the character goes
 * back into the MIME FIFO; f is not used by the visible statement. The
 * return statement is not visible in this listing. */
3698 nkf_char mime_ungetc(nkf_char c, FILE *f)
3700 mime_input_buf_unshift(c);
/* ungetc for the undecoded side of the FIFO. Two alternatives are visible:
 * delegate to the saved (*i_mungetc_buf) hook, or step the input index back
 * and store c there. The condition choosing between them is not visible;
 * presumably it is mimebuf_f, as in mime_getc_buf() -- confirm. */
3704 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
3707 (*i_mungetc_buf)(c,f);
3709 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
/* getc for the undecoded side of the FIFO: when mimebuf_f is set the saved
 * (*i_mgetc_buf) hook is read, otherwise the next byte at the input index
 * of the ring buffer is returned and the index advanced. */
3713 nkf_char mime_getc_buf(FILE *f)
3715 /* EOF is not stored in mime_input_buf, because the buffer contains ?= as
3716 a terminator, which was already checked in mime_integrity. */
3717 return ((mimebuf_f)?
3718 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
/* Insert the MIME layer into the input chain. Does nothing when mime_getc
 * is already the current i_getc. */
3721 void switch_mime_getc(void)
3723 if (i_getc!=mime_getc) {
/* Save the current reader pair in i_mgetc/i_mungetc and take their place. */
3724 i_mgetc = i_getc; i_getc = mime_getc;
3725 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
/* In strict mode a second layer is added: the saved pair moves to the _buf
 * hooks and mime_getc reads through the buffered variants. */
3726 if(mime_f==STRICT_MIME) {
3727 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3728 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/* Undo switch_mime_getc(): restore the saved reader hooks and, if an input
 * converter was saved in mime_iconv_back, reinstall it. The statement that
 * restores i_getc is not visible in this listing. */
3733 void unswitch_mime_getc(void)
/* Strict mode first removes the extra buffered layer. */
3735 if(mime_f==STRICT_MIME) {
3736 i_mgetc = i_mgetc_buf;
3737 i_mungetc = i_mungetc_buf;
3740 i_ungetc = i_mungetc;
3741 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3742 mime_iconv_back = NULL;
/*
 * mime_integrity -- called by mime_begin_strict() once a known prefix has
 * matched; pre-reads the encoded word into the MIME ring buffer.
 *
 *   f  input stream, read through (*i_getc)
 *   p  the matched mime_pattern prefix; it is copied into the buffer first
 *
 * The input and last indices are reset to top, the prefix is stored, and q
 * remembers the position just after it. Characters are then read until EOF
 * or until the distance from top wraps to zero under MIME_BUF_MASK (buffer
 * full). When the terminator is recognised the input index is rewound to q
 * so that decoding starts after the prefix. Otherwise the text is left in
 * the buffer with mime_decode_mode set to 1, which mime_getc treats as
 * buffered input without decoding.
 *
 * NOTE(review): in this listing the comment opened at 3749 has lost its
 * closing line, and the return statements are not visible.
 */
3745 nkf_char mime_integrity(FILE *f, const unsigned char *p)
3749 /* In buffered mode, read until =? or NL or buffer full
3751 mime_input_state.input = mime_input_state.top;
3752 mime_input_state.last = mime_input_state.top;
3754 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3756 q = mime_input_state.input;
3757 while((c=(*i_getc)(f))!=EOF) {
3758 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3759 break; /* buffer full */
/* d presumably holds the previously read character; its assignment is not
 * visible in this listing. */
3761 if (c=='=' && d=='?') {
3762 /* checked. skip header, start decode */
3763 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3764 /* mime_last_input = mime_input_state.input; */
3765 mime_input_state.input = q;
/* Only Base64 alphabet characters plus = and ? are accepted as payload. */
3769 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3771 /* Should we check length mod 4? */
3772 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3775 /* In case of Incomplete MIME, no MIME decode */
3776 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3777 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3778 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3779 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * mime_begin_strict -- strict-mode recognition of an encoded-word prefix.
 * The leading =? has already been consumed by the caller. Input characters
 * are compared, case-insensitively via nkf_toupper, with the entries of
 * mime_pattern; every character read is also kept in the recovery buffer r
 * so that it can be replayed when no pattern matches.
 *
 * On a match mime_decode_mode becomes the method letter of the pattern, the
 * current input converter is saved in mime_iconv_back and replaced by the
 * matching mime_priority_func entry, and for method B the function finishes
 * through mime_integrity().
 *
 * NOTE(review): the failure path and several declarations are among the
 * lines missing from this listing.
 */
3783 nkf_char mime_begin_strict(FILE *f)
3787 const unsigned char *p,*q;
3788 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3790 mime_decode_mode = FALSE;
3791 /* =? has been checked */
3793 p = mime_pattern[j];
/* Walk the pattern from index 2 until its terminating byte (<= SP). */
3796 for(i=2;p[i]>SP;i++) { /* start at =? */
3797 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3798 /* pattern fails, try next one */
/* Look for a later pattern that shares the first i characters (q is
 * presumably the pattern that just failed) and also matches c1. */
3800 while (mime_pattern[++j]) {
3801 p = mime_pattern[j];
3802 for(k=2;k<i;k++) /* assume length(p) > i */
3803 if (p[k]!=q[k]) break;
3804 if (k==i && nkf_toupper(c1)==p[k]) break;
3806 p = mime_pattern[j];
3807 if (p) continue; /* found next one, continue */
3808 /* all fails, output from recovery buffer */
/* Every pattern ends in ?B? or ?Q?, so p[i-2] is the method letter. */
3816 mime_decode_mode = p[i-2];
3818 mime_iconv_back = iconv;
3819 set_iconv(FALSE, mime_priority_func[j]);
3820 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3822 if (mime_decode_mode=='B') {
3823 mimebuf_f = unbuf_f;
3825 /* do MIME integrity check */
3826 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_begin -- non-strict recognition of an encoded-word preamble. The
 * leading =? has already been consumed by the caller. Every character read
 * is appended to the MIME ring buffer so that the text can be re-read
 * unchanged when it turns out not to be an encoded word. At most MAXRECOVER
 * characters are examined.
 *
 * Returns the last character read; per the original comment the value is
 * used only to detect EOF.
 *
 * NOTE(review): several control-flow lines are missing from this listing.
 */
3834 nkf_char mime_begin(FILE *f)
3839 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
3840 /* re-read and convert again from mime_buffer. */
3842 /* =? has been checked */
/* Remember where the preamble starts so that it can be discarded later. */
3843 k = mime_input_state.last;
3844 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
3845 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
3846 /* We accept any character type even if it is broken by new lines */
3847 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
/* Charset-name characters and line breaks are skipped over. */
3848 if (c1==LF||c1==SP||c1==CR||
3849 c1=='-'||c1=='_'||is_alnum(c1)) continue;
3851 /* Failed. But this could be another MIME preamble */
3853 mime_input_state.last--;
/* Read the method letter: b or B selects Base64, q or Q selects Q. */
3859 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3860 if (!(++i<MAXRECOVER) || c1==EOF) break;
3861 if (c1=='b'||c1=='B') {
3862 mime_decode_mode = 'B';
3863 } else if (c1=='q'||c1=='Q') {
3864 mime_decode_mode = 'Q';
3868 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
3869 if (!(++i<MAXRECOVER) || c1==EOF) break;
3871 mime_decode_mode = FALSE;
3877 if (!mime_decode_mode) {
3878 /* false MIME preamble, restart from mime_buffer */
3879 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
3880 /* Since we are in MIME mode until buffer becomes empty, */
3881 /* we never go into mime_begin again for a while. */
3884 /* discard mime preamble, and goto MIME mode */
3885 mime_input_state.last = k;
3886 /* do no MIME integrity check */
3887 return c1; /* used only for checking EOF */
/* Output hook taking one character. The body is not visible in this
 * listing; presumably it discards its argument -- confirm. */
3891 void no_putc(nkf_char c)
/* Write str followed by a newline to stderr; a null pointer is printed as
 * the text NULL. Any guard around the call is not visible in this listing. */
3896 void debug(const char *str)
3899 fprintf(stderr, "%s\n", str ? str : "NULL");
/* Record the name of the detected input encoding. The first name is kept as
 * is; if a later call passes a different name, the record becomes the empty
 * string, which get_guessed_code() reports as BINARY. The pointer is stored
 * without copying. */
3904 void set_input_codename(char *codename)
3906 if (!input_codename) {
3907 input_codename = codename;
3908 } else if (strcmp(codename, input_codename) != 0) {
3909 input_codename = "";
/*
 * get_guessed_code -- refine input_codename using the score bits of the
 * current input converter and return the resulting name. The refined name
 * is also stored back into input_codename.
 */
3913 static char* get_guessed_code(void)
/* An empty name means conflicting detections were recorded. */
3915 if (input_codename && !*input_codename) {
3916 input_codename = "BINARY";
3918 struct input_code *p = find_inputcode_byfunc(iconv);
/* Nothing was recorded at all. */
3919 if (!input_codename) {
3920 input_codename = "ASCII";
/* The score flags select a vendor variant of the base encoding. */
3921 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
3922 if (p->score & (SCORE_DEPEND|SCORE_CP932))
3923 input_codename = "CP932";
3924 } else if (strcmp(input_codename, "EUC-JP") == 0) {
3925 if (p->score & (SCORE_X0212))
3926 input_codename = "EUCJP-MS";
3927 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3928 input_codename = "CP51932";
3929 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
3930 if (p->score & (SCORE_KANA))
3931 input_codename = "CP50221";
3932 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
3933 input_codename = "CP50220";
3936 return input_codename;
3939 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * print_guessed_code -- print the guessed input encoding on stdout.
 *
 *   filename  when not NULL it is printed first, followed by a colon and a
 *             space
 *
 * The name comes from get_guessed_code(). The visible conditional chain
 * maps input_eol to a line-ending suffix (CR, LF, CRLF, or MIXED NL when
 * input_eol equals EOF); the statement using that suffix and the branch for
 * an empty input_codename are not visible in this listing.
 */
3940 void print_guessed_code(char *filename)
3942 if (filename != NULL) printf("%s: ", filename);
3943 if (input_codename && !*input_codename) {
3946 input_codename = get_guessed_code();
3948 printf("%s\n", input_codename);
3952 input_eol == CR ? " (CR)" :
3953 input_eol == LF ? " (LF)" :
3954 input_eol == CRLF ? " (CRLF)" :
3955 input_eol == EOF ? " (MIXED NL)" :
/*
 * hex_getc -- shared reader for escape sequences made of a marker character
 * followed by two hexadecimal digits.
 *
 *   ch  marker character (cap_getc passes a colon, url_getc a percent sign)
 *   f   input stream
 *   g   getc hook used to read characters
 *   u   ungetc hook (presumably used to push back a failed lookahead; the
 *       calls are not visible in this listing)
 *
 * When both following characters are hex digits the decoded byte
 * (first digit << 4 | second digit) is returned.
 */
3964 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
3966 nkf_char c1, c2, c3;
3972 if (!nkf_isxdigit(c2)){
3977 if (!nkf_isxdigit(c3)){
3982 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Input stage decoding colon-prefixed hex escapes: hex_getc() with a colon
 * as marker, reading through the saved i_cgetc/i_cungetc hooks. */
3985 nkf_char cap_getc(FILE *f)
3987 return hex_getc(':', f, i_cgetc, i_cungetc);
/* ungetc counterpart of cap_getc: forwards to the saved i_cungetc hook and
 * returns its result. */
3990 nkf_char cap_ungetc(nkf_char c, FILE *f)
3992 return (*i_cungetc)(c, f);
/* Input stage decoding percent-prefixed hex escapes: hex_getc() with a
 * percent sign as marker, reading through the saved i_ugetc/i_uungetc
 * hooks. */
3995 nkf_char url_getc(FILE *f)
3997 return hex_getc('%', f, i_ugetc, i_uungetc);
/* ungetc counterpart of url_getc: forwards to the saved i_uungetc hook and
 * returns its result. */
4000 nkf_char url_ungetc(nkf_char c, FILE *f)
4002 return (*i_uungetc)(c, f);
4006 #ifdef NUMCHAR_OPTION
/*
 * numchar_getc -- input stage that decodes numeric character references
 * and returns them as Unicode values via nkf_char_unicode_new().
 * Reads through the saved i_ngetc/i_nungetc hooks.
 *
 * Two digit loops are visible: after an x or X up to 7 hexadecimal digits
 * are accepted, otherwise up to 8 decimal digits. The recognition of the
 * introducer and terminator and the push-back on failure are among the
 * lines missing from this listing.
 */
4007 nkf_char numchar_getc(FILE *f)
4009 nkf_char (*g)(FILE *) = i_ngetc;
4010 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4021 if (buf[i] == 'x' || buf[i] == 'X'){
4022 for (j = 0; j < 7; j++){
4024 if (!nkf_isxdigit(buf[i])){
4031 c |= hex2bin(buf[i]);
4034 for (j = 0; j < 8; j++){
4038 if (!nkf_isdigit(buf[i])){
/* hex2bin is reused to get the value of a decimal digit. */
4045 c += hex2bin(buf[i]);
4051 return nkf_char_unicode_new(c);
/* ungetc counterpart of numchar_getc: forwards to the saved i_nungetc hook
 * and returns its result. */
4060 nkf_char numchar_ungetc(nkf_char c, FILE *f)
4062 return (*i_nungetc)(c, f);
4066 #ifdef UNICODE_NORMALIZATION
4068 /* Normalization Form C */
/*
 * nfc_getc -- input stage that replaces decomposed byte sequences by their
 * composed form, reading through the saved i_nfc_getc/i_nfc_ungetc hooks.
 *
 * The bytes collected in buf are looked up in normalization_table by binary
 * search on the nfd member; on a hit, buf is overwritten with the bytes of
 * the matching nfc member. The code that fills buf and returns characters
 * is among the lines missing from this listing.
 */
4069 nkf_char nfc_getc(FILE *f)
4071 nkf_char (*g)(FILE *f) = i_nfc_getc;
4072 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4073 int i=0, j, k=1, lower, upper;
4075 const unsigned char *array;
/* Continue only while the byte at buf[i] is not of the form 10xxxxxx. */
4078 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
4079 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4080 while (upper >= lower) {
4081 j = (lower+upper) / 2;
4082 array = normalization_table[j].nfd;
/* Compare byte by byte; the first difference decides which half of the
 * table is searched next. */
4083 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
4084 if (array[k] != buf[k]){
4085 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* Match: copy the composed form over the start of buf. */
4092 array = normalization_table[j].nfc;
4093 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4094 buf[i] = (nkf_char)(array[i]);
/* ungetc counterpart of nfc_getc: forwards to the saved i_nfc_ungetc hook
 * and returns its result. */
4105 nkf_char nfc_ungetc(nkf_char c, FILE *f)
4107 return (*i_nfc_ungetc)(c, f);
4109 #endif /* UNICODE_NORMALIZATION */
/*
 * base64decode -- map one Base64 alphabet character to its 6-bit value.
 * Besides the standard + and / the visible branches also accept the minus
 * sign as 62 and the underscore as 63. The constants are written as
 * character arithmetic; the original trailing comments give the intended
 * numeric values. Some branch conditions and the return statement are not
 * visible in this listing.
 */
4112 static nkf_char base64decode(nkf_char c)
4117 i = c - 'A'; /* A..Z 0-25 */
4118 } else if (c == '_') {
4119 i = '?' /* 63 */ ; /* _ 63 */
4121 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4123 } else if (c > '/') {
4124 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4125 } else if (c == '+' || c == '-') {
4126 i = '>' /* 62 */ ; /* + and - 62 */
4128 i = '?' /* 63 */ ; /* / 63 */
4136 nkf_char c1, c2, c3, c4, cc;
4137 nkf_char t1, t2, t3, t4, mode, exit_mode;
4138 nkf_char lwsp_count;
4141 nkf_char lwsp_size = 128;
4143 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4144 return mime_input_buf(mime_input_state.top++);
4146 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4147 mime_decode_mode=FALSE;
4148 unswitch_mime_getc();
4149 return (*i_getc)(f);
4152 if (mimebuf_f == FIXED_MIME)
4153 exit_mode = mime_decode_mode;
4156 if (mime_decode_mode == 'Q') {
4157 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4159 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4160 if (c1<=SP || DEL<=c1) {
4161 mime_decode_mode = exit_mode; /* prepare for quit */
4164 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4168 mime_decode_mode = exit_mode; /* prepare for quit */
4169 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4170 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4171 /* end Q encoding */
4172 input_mode = exit_mode;
4174 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4175 if (lwsp_buf==NULL) {
4176 perror("can't malloc");
4179 while ((c1=(*i_getc)(f))!=EOF) {
4184 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4192 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4193 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4208 lwsp_buf[lwsp_count] = (unsigned char)c1;
4209 if (lwsp_count++>lwsp_size){
4211 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4212 if (lwsp_buf_new==NULL) {
4214 perror("can't realloc");
4217 lwsp_buf = lwsp_buf_new;
4223 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4225 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4226 i_ungetc(lwsp_buf[lwsp_count],f);
4232 if (c1=='='&&c2<SP) { /* this is soft wrap */
4233 while((c1 = (*i_mgetc)(f)) <=SP) {
4234 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4236 mime_decode_mode = 'Q'; /* still in MIME */
4237 goto restart_mime_q;
4240 mime_decode_mode = 'Q'; /* still in MIME */
4244 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4245 if (c2<=SP) return c2;
4246 mime_decode_mode = 'Q'; /* still in MIME */
4247 return ((hex2bin(c2)<<4) + hex2bin(c3));
4250 if (mime_decode_mode != 'B') {
4251 mime_decode_mode = FALSE;
4252 return (*i_mgetc)(f);
4256 /* Base64 encoding */
4258 MIME allows line break in the middle of
4259 Base64, but we are very pessimistic in decoding
4260 in unbuf mode because MIME encoded code may broken by
4261 less or editor's control sequence (such as ESC-[-K in unbuffered
4262 mode. ignore incomplete MIME.
4264 mode = mime_decode_mode;
4265 mime_decode_mode = exit_mode; /* prepare for quit */
4267 while ((c1 = (*i_mgetc)(f))<=SP) {
4272 if ((c2 = (*i_mgetc)(f))<=SP) {
4275 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4276 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4279 if ((c1 == '?') && (c2 == '=')) {
4282 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
4283 if (lwsp_buf==NULL) {
4284 perror("can't malloc");
4287 while ((c1=(*i_getc)(f))!=EOF) {
4292 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4300 if ((c1=(*i_getc)(f))!=EOF) {
4304 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4319 lwsp_buf[lwsp_count] = (unsigned char)c1;
4320 if (lwsp_count++>lwsp_size){
4322 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4323 if (lwsp_buf_new==NULL) {
4325 perror("can't realloc");
4328 lwsp_buf = lwsp_buf_new;
4334 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4336 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4337 i_ungetc(lwsp_buf[lwsp_count],f);
4344 if ((c3 = (*i_mgetc)(f))<=SP) {
4347 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4348 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4352 if ((c4 = (*i_mgetc)(f))<=SP) {
4355 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4356 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4360 mime_decode_mode = mode; /* still in MIME sigh... */
4362 /* BASE 64 decoding */
4364 t1 = 0x3f & base64decode(c1);
4365 t2 = 0x3f & base64decode(c2);
4366 t3 = 0x3f & base64decode(c3);
4367 t4 = 0x3f & base64decode(c4);
4368 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4370 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4371 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4373 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4374 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4376 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4381 return mime_input_buf(mime_input_state.top++);
/* Base64 output alphabet, indexed by 6-bit value (used by eof_mime() and
 * mimeout_addchar()). */
4384 static const char basis_64[] =
4385 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Capacity of the pending-character buffer of the MIME encoder. */
4387 #define MIMEOUT_BUF_LENGTH (60)
/* Member of mimeout_state (the enclosing struct lines and the other members
 * used in this file, state and count, are not visible in this listing). */
4389 char buf[MIMEOUT_BUF_LENGTH+1];
4394 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * open_mime -- start an encoded word for output mode `mode`.
 *
 * The prefix is chosen by looking `mode` up in mime_encode (the first
 * pattern is the default) and mimeout_mode is set to the matching
 * mime_encode_method letter. Whitespace pending in mimeout_state.buf is
 * written out unencoded through (*o_mputc), with a line break inserted when
 * base64_count is already above 45. The remaining buffered characters are
 * then fed back through mime_putc() after the buffer count has been reset.
 * The statement that writes the prefix itself is not visible in this
 * listing.
 */
4396 static void open_mime(nkf_char mode)
4398 const unsigned char *p;
4401 p = mime_pattern[0];
4402 for(i=0;mime_pattern[i];i++) {
4403 if (mode == mime_encode[i]) {
4404 p = mime_pattern[i];
4408 mimeout_mode = mime_encode_method[i];
/* Long line: fold before the encoded word, keeping one pending blank. */
4410 if (base64_count>45) {
4411 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4412 (*o_mputc)(mimeout_state.buf[i]);
4415 PUT_NEWLINE((*o_mputc));
4418 if (mimeout_state.count>0
4419 && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4420 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) {
/* Copy leading whitespace of the pending buffer to the output as is. */
4424 for (;i<mimeout_state.count;i++) {
4425 if (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4426 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) {
4427 (*o_mputc)(mimeout_state.buf[i]);
/* Re-submit what is left now that an encoded word is open. */
4437 j = mimeout_state.count;
4438 mimeout_state.count = 0;
4440 mime_putc(mimeout_state.buf[i]);
/*
 * mime_prechar -- called by base64_conv() before each character to keep
 * encoded output lines short. The projected line length is base64_count
 * plus the Base64 size of the pending buffer (count/3*4). When it exceeds
 * the limit of the current case (73, 66 or 60 in the visible branches) the
 * current encoded word is closed by sending EOF down the chain, a line
 * break is written and a space starts the continuation line. The conditions
 * selecting between the three cases are only partly visible.
 */
4444 static void mime_prechar(nkf_char c2, nkf_char c1)
4446 if (mimeout_mode > 0){
4448 if (base64_count + mimeout_state.count/3*4> 73){
4449 (*o_base64conv)(EOF,0);
4450 OCONV_NEWLINE((*o_base64conv));
4451 (*o_base64conv)(0,SP);
4455 if (base64_count + mimeout_state.count/3*4> 66) {
4456 (*o_base64conv)(EOF,0);
4457 OCONV_NEWLINE((*o_base64conv));
4458 (*o_base64conv)(0,SP);
/* No encoded word is open here: choose Q for ASCII and ISO-8859-1 output,
 * B otherwise, and open one before folding. */
4464 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4465 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4466 open_mime(output_mode);
4467 (*o_base64conv)(EOF,0);
4468 OCONV_NEWLINE((*o_base64conv));
4469 (*o_base64conv)(0,SP);
/* Counterpart of open_mime(). The body is not visible in this listing;
 * presumably it writes the encoded-word terminator -- confirm. */
4476 static void close_mime(void)
/*
 * eof_mime -- finish the current encoded word at end of input. For the
 * partial Base64 states the bits left over in mimeout_state.state are
 * written as a final alphabet character (two leftover bits shifted left by
 * 4, or four leftover bits shifted left by 2). The case labels, the padding
 * and the closing actions are among the lines missing from this listing.
 */
4484 static void eof_mime(void)
4486 switch(mimeout_mode) {
4491 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]);
4497 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]);
4502 if (mimeout_mode > 0) {
4503 if (mimeout_f!=FIXED_MIME) {
4505 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar -- encode one byte c according to mimeout_mode and write
 * the result through (*o_mputc).
 *
 * Q encoding: a byte that is not alphanumeric is written as two hex digits
 * (the escape character written before them is not visible here).
 * Base64: three steps keyed by mimeout_mode. The first byte yields one
 * output character from its top 6 bits and is remembered in
 * mimeout_state.state; the second yields one character from the remembered
 * low 2 bits plus its own top 4 bits; the third yields two characters and
 * completes the 24-bit group. The case labels and the updates of
 * mimeout_mode and base64_count are among the missing lines.
 */
4510 static void mimeout_addchar(nkf_char c)
4512 switch(mimeout_mode) {
4517 } else if(!nkf_isalnum(c)) {
4519 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4520 (*o_mputc)(bin2hex((c&0xf)));
4528 mimeout_state.state=c;
4529 (*o_mputc)(basis_64[c>>2]);
4534 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4535 mimeout_state.state=c;
4540 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4541 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_putc -- output-side MIME encoder entry point. Takes one character
 * (or EOF) and writes either unencoded text through (*o_mputc) or encoded
 * text through mimeout_addchar(), buffering undecided characters in
 * mimeout_state.buf.
 *
 * Visible structure:
 *   - mimeout_f == FIXED_MIME: line-length checks (base64_count > 71) with
 *     PUT_NEWLINE; the rest of that branch is not visible.
 *   - otherwise, c == EOF drains the pending buffer, first opening an
 *     encoded word when mimeout_mode is -1 and more than one character is
 *     buffered.
 *   - mimeout_mode == Q, mimeout_mode <= 0, and the remaining modes (the
 *     original comment lists B, 1, 2) each apply their own rules for
 *     buffering, flushing and opening an encoded word.
 *
 * NOTE(review): a large part of the control flow (braces, returns, loop
 * headers) is missing from this listing, so the comments below describe
 * single visible statements only.
 */
4552 static void mime_putc(nkf_char c)
4557 if (mimeout_f == FIXED_MIME){
4558 if (mimeout_mode == 'Q'){
4559 if (base64_count > 71){
4560 if (c!=CR && c!=LF) {
4562 PUT_NEWLINE((*o_mputc));
4567 if (base64_count > 71){
4569 PUT_NEWLINE((*o_mputc));
4572 if (c == EOF) { /* c==EOF */
4576 if (c != EOF) { /* c==EOF */
4582 /* mimeout_f != FIXED_MIME */
4584 if (c == EOF) { /* c==EOF */
4585 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
/* Take ownership of the pending characters before draining them. */
4586 j = mimeout_state.count;
4587 mimeout_state.count = 0;
4589 if (mimeout_mode > 0) {
4590 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4592 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4595 mimeout_addchar(mimeout_state.buf[i]);
4599 mimeout_addchar(mimeout_state.buf[i]);
4603 mimeout_addchar(mimeout_state.buf[i]);
4609 mimeout_addchar(mimeout_state.buf[i]);
/* lastchar is the most recently buffered character, if any. */
4615 if (mimeout_state.count > 0){
4616 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4621 if (mimeout_mode=='Q') {
4622 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4623 if (c == CR || c == LF) {
4628 } else if (c <= SP) {
4630 if (base64_count > 70) {
4631 PUT_NEWLINE((*o_mputc));
4634 if (!nkf_isblank(c)) {
4639 if (base64_count > 70) {
4641 PUT_NEWLINE((*o_mputc));
4644 open_mime(output_mode);
4646 if (!nkf_noescape_mime(c)) {
/* No encoded word is open yet. */
4657 if (mimeout_mode <= 0) {
4658 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4659 if (nkf_isspace(c)) {
4661 if (mimeout_mode == -1) {
4664 if (c==CR || c==LF) {
4666 open_mime(output_mode);
4672 for (i=0;i<mimeout_state.count;i++) {
4673 (*o_mputc)(mimeout_state.buf[i]);
4674 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
/* Restart the pending buffer with the current character. */
4685 mimeout_state.buf[0] = (char)c;
4686 mimeout_state.count = 1;
/* Fold when the pending text would push the line past 76 columns. */
4688 if (base64_count > 1
4689 && base64_count + mimeout_state.count > 76
4690 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4691 PUT_NEWLINE((*o_mputc));
4693 if (!nkf_isspace(mimeout_state.buf[0])){
4698 mimeout_state.buf[mimeout_state.count++] = (char)c;
/* The pending buffer is full: an encoded word has to be opened. */
4699 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4700 open_mime(output_mode);
4705 if (lastchar==CR || lastchar == LF){
4706 for (i=0;i<mimeout_state.count;i++) {
4707 (*o_mputc)(mimeout_state.buf[i]);
4710 mimeout_state.count = 0;
/* Write all but the last pending character and keep a single space. */
4713 for (i=0;i<mimeout_state.count-1;i++) {
4714 (*o_mputc)(mimeout_state.buf[i]);
4717 mimeout_state.buf[0] = SP;
4718 mimeout_state.count = 1;
4720 open_mime(output_mode);
4723 /* mimeout_mode == 'B', 1, 2 */
4724 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4725 if (lastchar == CR || lastchar == LF){
4726 if (nkf_isblank(c)) {
4727 for (i=0;i<mimeout_state.count;i++) {
4728 mimeout_addchar(mimeout_state.buf[i]);
4730 mimeout_state.count = 0;
4731 } else if (SP<c && c<DEL) {
4733 for (i=0;i<mimeout_state.count;i++) {
4734 (*o_mputc)(mimeout_state.buf[i]);
4737 mimeout_state.count = 0;
4739 mimeout_state.buf[mimeout_state.count++] = (char)c;
4742 if (c==SP || c==TAB || c==CR || c==LF) {
/* Look for a printable character (between SP and DEL) in the pending text. */
4743 for (i=0;i<mimeout_state.count;i++) {
4744 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4746 for (i=0;i<mimeout_state.count;i++) {
4747 (*o_mputc)(mimeout_state.buf[i]);
4750 mimeout_state.count = 0;
4753 mimeout_state.buf[mimeout_state.count++] = (char)c;
4754 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4756 for (i=0;i<mimeout_state.count;i++) {
4757 (*o_mputc)(mimeout_state.buf[i]);
4760 mimeout_state.count = 0;
4764 if (mimeout_state.count>0 && SP<c && c!='=') {
4765 mimeout_state.buf[mimeout_state.count++] = (char)c;
4766 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4767 j = mimeout_state.count;
4768 mimeout_state.count = 0;
4770 mimeout_addchar(mimeout_state.buf[i]);
4777 if (mimeout_state.count>0) {
4778 j = mimeout_state.count;
4779 mimeout_state.count = 0;
4781 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4783 mimeout_addchar(mimeout_state.buf[i]);
4789 (*o_mputc)(mimeout_state.buf[i]);
4791 open_mime(output_mode);
/* MIME output stage of the conversion chain: lets mime_prechar() handle
 * line folding for the coming character, then passes c2/c1 on through
 * (*o_base64conv). */
4797 void base64_conv(nkf_char c2, nkf_char c1)
4799 mime_prechar(c2, c1);
4800 (*o_base64conv)(c2,c1);
/* State of one iconv-based converter. nkf_iconv_new() also assigns members
 * named cd and input_buffer, whose declarations are not visible in this
 * listing. */
4804 typedef struct nkf_iconv_t {
/* Size in bytes of the allocation behind input_buffer. */
4807 size_t input_buffer_size;
/* Conversion output area and its size in bytes. */
4808 char *output_buffer;
4809 size_t output_buffer_size;
/*
 * nkf_iconv_new -- set up a converter from fromcode to tocode: allocates an
 * input buffer of IOBUF_SIZE bytes and an output buffer of twice that size,
 * then opens the iconv descriptor. Failures are reported with perror.
 *
 * NOTE(review): as listed, converter is declared as a struct value but
 * accessed with the arrow operator, and one perror call receives the result
 * of an fprintf call that has no stream argument. Both look like defects
 * (this code may be compiled out upstream) -- confirm before enabling it.
 * What happens after a failed allocation is not visible in this listing.
 */
4812 nkf_iconv_t nkf_iconv_new(char *tocode, char *fromcode)
4814 nkf_iconv_t converter;
4816 converter->input_buffer_size = IOBUF_SIZE;
4817 converter->input_buffer = malloc(converter->input_buffer_size);
4818 if (converter->input_buffer == NULL)
4819 perror("can't malloc");
4821 converter->output_buffer_size = IOBUF_SIZE * 2;
4822 converter->output_buffer = malloc(converter->output_buffer_size);
4823 if (converter->output_buffer == NULL)
4824 perror("can't malloc");
4826 converter->cd = iconv_open(tocode, fromcode);
4827 if (converter->cd == (iconv_t)-1)
4831 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
4834 perror("can't iconv_open");
/*
 * nkf_iconv_convert -- read bytes from the input stream into the converter
 * input buffer, run iconv on them and write the converted bytes through
 * (*o_putc). On an iconv error the unconsumed input is moved to the start
 * of the input buffer, or the output buffer is doubled with realloc; the
 * errno tests selecting between these are not visible in this listing.
 *
 * NOTE(review): several things look wrong as listed and should be checked
 * before this code is enabled:
 *   - the read loop uses f although the parameter is named input;
 *   - the loop breaks while the buffer still has room (the test looks
 *     inverted);
 *   - the output loop indexes from the end of the buffer using the
 *     remaining-space counter;
 *   - realloc is given converter->outbuf, a member not among the visible
 *     struct fields.
 */
4839 size_t nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
4841 size_t invalid = (size_t)0;
4842 char *input_buffer = converter->input_buffer;
4843 size_t input_length = (size_t)0;
4844 char *output_buffer = converter->output_buffer;
4845 size_t output_length = converter->output_buffer_size;
4850 while ((c = (*i_getc)(f)) != EOF) {
4851 input_buffer[input_length++] = c;
4852 if (input_length < converter->input_buffer_size) break;
4856 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
4857 while (output_length-- > 0) {
4858 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
4860 if (ret == (size_t) - 1) {
4863 if (input_buffer != converter->input_buffer)
4864 memmove(converter->input_buffer, input_buffer, input_length);
4867 converter->output_buffer_size *= 2;
4868 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
4869 if (output_buffer == NULL) {
4870 perror("can't realloc");
4873 converter->output_buffer = output_buffer;
4876 perror("can't iconv");
4888 void nkf_iconv_close(nkf_iconv_t *convert)
4890 free(converter->inbuf);
4891 free(converter->outbuf);
4892 iconv_close(converter->cd);
4900 struct input_code *p = input_code_list;
4912 mime_f = MIME_DECODE_DEFAULT;
4913 mime_decode_f = FALSE;
4918 x0201_f = X0201_DEFAULT;
4919 iso2022jp_f = FALSE;
4920 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
4921 ms_ucs_map_f = UCS_MAP_ASCII;
4923 #ifdef UTF8_INPUT_ENABLE
4924 no_cp932ext_f = FALSE;
4925 no_best_fit_chars_f = FALSE;
4926 encode_fallback = NULL;
4927 unicode_subchar = '?';
4928 input_endian = ENDIAN_BIG;
4930 #ifdef UTF8_OUTPUT_ENABLE
4931 output_bom_f = FALSE;
4932 output_endian = ENDIAN_BIG;
4934 #ifdef UNICODE_NORMALIZATION
4950 #ifdef SHIFTJIS_CP932
4960 for (i = 0; i < 256; i++){
4961 prefix_table[i] = 0;
4965 mimeout_state.count = 0;
4970 fold_preserve_f = FALSE;
4973 kanji_intro = DEFAULT_J;
4974 ascii_intro = DEFAULT_R;
4975 fold_margin = FOLD_MARGIN;
4976 o_zconv = no_connection;
4977 o_fconv = no_connection;
4978 o_eol_conv = no_connection;
4979 o_rot_conv = no_connection;
4980 o_hira_conv = no_connection;
4981 o_base64conv = no_connection;
4982 o_iso2022jp_check_conv = no_connection;
4985 i_ungetc = std_ungetc;
4987 i_bungetc = std_ungetc;
4990 i_mungetc = std_ungetc;
4991 i_mgetc_buf = std_getc;
4992 i_mungetc_buf = std_ungetc;
4993 output_mode = ASCII;
4995 mime_decode_mode = FALSE;
5001 init_broken_state();
5002 z_prev2=0,z_prev1=0;
5004 iconv_for_check = 0;
5006 input_codename = NULL;
5007 input_encoding = NULL;
5008 output_encoding = NULL;
/*
 * module_connection -- build the conversion pipeline for one run.
 *
 * Output side: oconv starts as the converter of the output encoding and
 * each enabled filter is put in front of it by saving the current oconv in
 * the o_* hook of that filter and installing the filter as the new oconv.
 * The filter installed last therefore runs first.
 * Input side: i_getc/i_ungetc start at the standard readers and each
 * enabled decoder wraps the current pair in the same way.
 *
 * kanji_convert() treats a negative return value as an error (no output
 * encoding given). The return statements, and most of the flag tests
 * guarding each stage, are among the lines missing from this listing.
 */
5014 int module_connection(void)
5016 if (input_encoding) set_input_encoding(input_encoding);
/* No output encoding chosen: fall back to the default, and for guess-only
 * or no-output runs to ISO-2022-JP. */
5017 if (!output_encoding) {
5018 output_encoding = nkf_default_encoding();
5020 if (!output_encoding) {
5021 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5024 set_output_encoding(output_encoding);
5025 oconv = nkf_enc_to_oconv(output_encoding);
5028 /* replace continuation module, from output side */
5030 /* output redirection */
5032 if (noout_f || guess_f){
5039 if (mimeout_f == TRUE) {
5040 o_base64conv = oconv; oconv = base64_conv;
5042 /* base64_count = 0; */
5045 if (eolmode_f || guess_f) {
5046 o_eol_conv = oconv; oconv = eol_conv;
5049 o_rot_conv = oconv; oconv = rot_conv;
5052 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5055 o_hira_conv = oconv; oconv = hira_conv;
5058 o_fconv = oconv; oconv = fold_conv;
5061 if (alpha_f || x0201_f) {
5062 o_zconv = oconv; oconv = z_conv;
5066 i_ungetc = std_ungetc;
5067 /* input redirection */
5070 i_cgetc = i_getc; i_getc = cap_getc;
5071 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5074 i_ugetc = i_getc; i_getc = url_getc;
5075 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5078 #ifdef NUMCHAR_OPTION
5080 i_ngetc = i_getc; i_getc = numchar_getc;
5081 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5084 #ifdef UNICODE_NORMALIZATION
5086 i_nfc_getc = i_getc; i_getc = nfc_getc;
5087 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
/* With a fixed MIME buffer mode the MIME reader is installed right away
 * instead of on demand through switch_mime_getc(). */
5090 if (mime_f && mimebuf_f==FIXED_MIME) {
5091 i_mgetc = i_getc; i_getc = mime_getc;
5092 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5095 i_bgetc = i_getc; i_getc = broken_getc;
5096 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* An explicit input encoding fixes the input converter; otherwise e_iconv
 * is installed with FALSE as the first argument. */
5098 if (input_encoding) {
5099 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5101 set_iconv(FALSE, e_iconv);
5105 struct input_code *p = input_code_list;
5114 Conversion main loop. Code detection only.
5117 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Connect the modules, then read f one character at a time through the
 * i_getc chain until EOF.  Only built when neither PERL_XS nor WIN32DLL
 * is defined.
 */
5118 nkf_char noconvert(FILE *f)
5123 module_connection();
5124 while ((c = (*i_getc)(f)) != EOF)
/*
 * Conversion main loop for one input stream.
 *
 * Connects the modules, then reads f through the i_getc chain.  UTF-32 and
 * UTF-16 input are consumed in fixed-size groups and handed to
 * nkf_iconv_utf_32() / nkf_iconv_utf_16().  All other input is read byte by
 * byte: shift controls (SI / SO) and ESC sequences update input_mode and
 * shift_mode, MIME introducers are checked, and completed characters are
 * passed to (*iconv) or (*oconv).  After the loop (*iconv)(EOF, 0, 0) is
 * called and, if no input code name was set, the input_code_list entry with
 * the lowest score supplies it.
 *
 * c2 holds a pending first byte, c1 the byte just read; c3 and c4 are
 * extra bytes for longer sequences.
 */
5131 int kanji_convert(FILE *f)
5133 nkf_char c1=0, c2=0, c3=0, c4=0;
5134 int shift_mode = FALSE; /* TRUE or FALSE or JIS_X_0201_1976_K */
5135 int is_8bit = FALSE;
5137 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5142 output_mode = ASCII;
/* loop-control shorthands used inside the byte loop below */
5144 #define NEXT continue /* no output, get next */
5145 #define SKIP c2=0;continue /* no output, get next */
5146 #define MORE c2=c1;continue /* need one more byte */
5147 #define SEND ; /* output c1 and c2, get next */
5148 #define LAST break /* end of loop, go closing */
/* a negative result from module_connection() is reported on stderr */
5150 if (module_connection() < 0) {
5151 #if !defined(PERL_XS) && !defined(WIN32DLL)
5152 fprintf(stderr, "no output encoding given\n");
5158 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: four bytes are read for every conversion call */
5159 if(iconv == w_iconv32){
5160 while ((c1 = (*i_getc)(f)) != EOF &&
5161 (c2 = (*i_getc)(f)) != EOF &&
5162 (c3 = (*i_getc)(f)) != EOF &&
5163 (c4 = (*i_getc)(f)) != EOF) {
5164 nkf_iconv_utf_32(c1, c2, c3, c4);
5166 (*i_ungetc)(EOF, f);
/*
 * UTF-16 input: two bytes per call; a result of -2 triggers a second call
 * with two more bytes (presumably the second half of a surrogate pair -
 * confirm in nkf_iconv_utf_16).
 */
5168 else if (iconv == w_iconv16) {
5169 while ((c1 = (*i_getc)(f)) != EOF &&
5170 (c2 = (*i_getc)(f)) != EOF) {
5171 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5172 (c3 = (*i_getc)(f)) != EOF &&
5173 (c4 = (*i_getc)(f)) != EOF) {
5174 nkf_iconv_utf_16(c1, c2, c3, c4);
5177 (*i_ungetc)(EOF, f);
/* every other encoding is processed one byte at a time */
5181 while ((c1 = (*i_getc)(f)) != EOF) {
5182 #ifdef INPUT_CODE_FIX
5183 if (!input_encoding)
5189 /* in case of 8th bit is on */
5190 if (!estab_f&&!mime_decode_mode) {
5191 /* in case of not established yet */
5192 /* It is still ambiguous */
5193 if (h_conv(f, c2, c1)==EOF)
5198 /* in case of already established */
5200 /* ignore bogus code */
5207 /* 2nd byte of 7 bit code or SJIS */
5212 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5215 } else if (c1 > DEL) {
5217 if (!estab_f && !iso8859_f) {
5218 /* not established yet */
5220 } else { /* estab_f==TRUE */
5226 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5227 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5229 c2 = JIS_X_0201_1976_K;
5234 /* already established */
5238 } else if (SP < c1 && c1 < DEL) {
5239 /* in case of Roman characters */
5241 /* output 1 shifted byte */
5245 } else if (SP <= c1 && c1 < (0xE0&0x7F)){
5246 /* output 1 shifted byte */
5247 c2 = JIS_X_0201_1976_K;
5250 /* look like bogus code */
5253 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5254 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5255 /* in case of Kanji shifted */
5257 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5258 /* Check MIME code */
5259 if ((c1 = (*i_getc)(f)) == EOF) {
5262 } else if (c1 == '?') {
5263 /* =? is mime conversion start sequence */
5264 if(mime_f == STRICT_MIME) {
5265 /* check in real detail */
5266 if (mime_begin_strict(f) == EOF)
5269 } else if (mime_begin(f) == EOF)
5278 /* normal ASCII code */
5281 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5284 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5287 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5288 if ((c1 = (*i_getc)(f)) == EOF) {
5289 /* (*oconv)(0, ESC); don't send bogus code */
5291 } else if (c1 == '$') {
5292 if ((c1 = (*i_getc)(f)) == EOF) {
5294 (*oconv)(0, ESC); don't send bogus code
5295 (*oconv)(0, '$'); */
5297 } else if (c1 == '@'|| c1 == 'B') {
5298 /* This is kanji introduction */
5299 input_mode = JIS_X_0208;
5301 set_input_codename("ISO-2022-JP");
5303 debug("ISO-2022-JP");
5306 } else if (c1 == '(') {
5307 if ((c1 = (*i_getc)(f)) == EOF) {
5308 /* don't send bogus code
5314 } else if (c1 == '@'|| c1 == 'B') {
5315 /* This is kanji introduction */
5316 input_mode = JIS_X_0208;
5320 } else if (c1 == 'D'){
5321 input_mode = JIS_X_0212;
5324 #endif /* X0212_ENABLE */
5325 } else if (c1 == 'O' || c1 == 'Q'){
5326 input_mode = JIS_X_0213_1;
5329 } else if (c1 == 'P'){
5330 input_mode = JIS_X_0213_2;
5334 /* could be some special code */
5341 } else if (broken_f&0x2) {
5342 /* accept any ESC-(-x as broken code ... */
5343 input_mode = JIS_X_0208;
5352 } else if (c1 == '(') {
5353 if ((c1 = (*i_getc)(f)) == EOF) {
5354 /* don't send bogus code
5356 (*oconv)(0, '('); */
5360 /* This is X0201 kana introduction */
5361 input_mode = JIS_X_0201_1976_K; shift_mode = JIS_X_0201_1976_K;
5363 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5364 /* This is X0208 kanji introduction */
5365 input_mode = ASCII; shift_mode = FALSE;
5367 } else if (broken_f&0x2) {
5368 input_mode = ASCII; shift_mode = FALSE;
5373 /* maintain various input_mode here */
5377 } else if ( c1 == 'N' || c1 == 'n'){
5379 c4 = (*i_getc)(f); /* skip SS2 */
5380 if ( (SP<=c4 && c4 < 0x60) || (0xa0<=c4 && c4 < 0xe0)){
5382 c2 = JIS_X_0201_1976_K;
5395 } else if (c1 == ESC && iconv == s_iconv) {
5396 /* ESC in Shift_JIS */
5397 if ((c1 = (*i_getc)(f)) == EOF) {
5398 /* (*oconv)(0, ESC); don't send bogus code */
5400 } else if (c1 == '$') {
5402 if ((c1 = (*i_getc)(f)) == EOF) {
5404 (*oconv)(0, ESC); don't send bogus code
5405 (*oconv)(0, '$'); */
5408 if (('E' <= c1 && c1 <= 'G') ||
5409 ('O' <= c1 && c1 <= 'Q')) {
5417 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
5418 c3 = nkf_char_unicode_new((jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000);
5419 while ((c1 = (*i_getc)(f)) != EOF) {
5420 if (SP <= c1 && c1 <= 'z') {
5421 (*oconv)(0, c1 + c3);
5422 } else break; /* c1 == SO */
5426 if (c1 == EOF) LAST;
5433 } else if (c1 == LF || c1 == CR) {
5435 input_mode = ASCII; set_iconv(FALSE, 0);
5437 } else if (mime_decode_f && !mime_decode_mode){
5439 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5447 } else { /* if (c1 == CR)*/
5448 if ((c1=(*i_getc)(f))!=EOF) {
5452 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5472 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5475 if ((c3 = (*i_getc)(f)) != EOF) {
5478 if ((c4 = (*i_getc)(f)) != EOF) {
5480 (*iconv)(c2, c1, c3|c4);
5485 /* 3 bytes EUC or UTF-8 */
5486 if ((c3 = (*i_getc)(f)) != EOF) {
5488 (*iconv)(c2, c1, c3);
5496 0x7F <= c2 && c2 <= 0x92 &&
5497 0x21 <= c1 && c1 <= 0x7E) {
5501 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5504 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5508 (*oconv)(PREFIX_EUCG3 | c2, c1);
5510 #endif /* X0212_ENABLE */
5512 (*oconv)(PREFIX_EUCG3 | c2, c1);
5515 (*oconv)(input_mode, c1); /* other special case */
5521 /* goto next_word */
5525 (*iconv)(EOF, 0, 0);
5526 if (!input_codename)
5529 struct input_code *p = input_code_list;
5530 struct input_code *result = p;
5532 if (p->score < result->score) result = p;
5535 set_input_codename(result->name);
5537 debug(result->name);
5545 * int options(unsigned char *cp)
/*
 * Parse one option string and update the global option flags.
 *
 * cp points at the option text.  Everything up to and including the first
 * '-' is skipped, then each following character selects an option; some
 * options consume further characters as their argument (digits, a letter).
 * "--" introduces a long option, which is looked up by name in
 * long_option[]; an entry with a non-empty alias is handled by parsing the
 * alias string in place of cp, the remaining long options are matched by
 * name one after another.  An unknown option is reported on stderr unless
 * PERL_XS or WIN32DLL is defined.
 */
5551 int options(unsigned char *cp)
5555 unsigned char *cp_back = NULL;
/* skip up to and including the first '-' */
5561 while(*cp && *cp++!='-');
5562 while (*cp || cp_back) {
5570 case '-': /* literal options */
5571 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/* find the table entry whose name matches cp, comparing up to '=' or the end of the name */
5575 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5576 p = (unsigned char *)long_option[i].name;
5577 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5578 if (*p == cp[j] || cp[j] == SP){
5585 #if !defined(PERL_XS) && !defined(WIN32DLL)
5586 fprintf(stderr, "unknown long option: --%s\n", cp);
/* step over the text of this long option */
5590 while(*cp && *cp != SP && cp++);
/* an alias is itself an option string: continue parsing from it */
5591 if (long_option[i].alias[0]){
5593 cp = (unsigned char *)long_option[i].alias;
/* --ic=NAME: input encoding looked up by upper-cased name */
5595 if (strcmp(long_option[i].name, "ic=") == 0){
5596 nkf_str_upcase((char *)p, codeset, 32);
5597 enc = nkf_enc_find(codeset);
5599 input_encoding = enc;
/* --oc=NAME: output encoding looked up by upper-cased name */
5602 if (strcmp(long_option[i].name, "oc=") == 0){
5603 nkf_str_upcase((char *)p, codeset, 32);
5604 enc = nkf_enc_find(codeset);
/* NOTE(review): enc is stored into output_encoding, which is used as a
 * pointer elsewhere, so this looks like a relational comparison of a
 * pointer with 0; "!enc" would express the intent - confirm. */
5605 if (enc <= 0) continue;
5606 output_encoding = enc;
5609 if (strcmp(long_option[i].name, "guess=") == 0){
5610 if (p[0] == '0' || p[0] == '1') {
5618 if (strcmp(long_option[i].name, "overwrite") == 0){
5621 preserve_time_f = TRUE;
5624 if (strcmp(long_option[i].name, "overwrite=") == 0){
5627 preserve_time_f = TRUE;
/* NOTE(review): the malloc() result is passed to strcpy() without a NULL check */
5629 backup_suffix = malloc(strlen((char *) p) + 1);
5630 strcpy(backup_suffix, (char *) p);
5633 if (strcmp(long_option[i].name, "in-place") == 0){
5636 preserve_time_f = FALSE;
5639 if (strcmp(long_option[i].name, "in-place=") == 0){
5642 preserve_time_f = FALSE;
/* NOTE(review): same unchecked malloc() as for "overwrite=" */
5644 backup_suffix = malloc(strlen((char *) p) + 1);
5645 strcpy(backup_suffix, (char *) p);
5650 if (strcmp(long_option[i].name, "cap-input") == 0){
5654 if (strcmp(long_option[i].name, "url-input") == 0){
5659 #ifdef NUMCHAR_OPTION
5660 if (strcmp(long_option[i].name, "numchar-input") == 0){
5666 if (strcmp(long_option[i].name, "no-output") == 0){
5670 if (strcmp(long_option[i].name, "debug") == 0){
5675 if (strcmp(long_option[i].name, "cp932") == 0){
5676 #ifdef SHIFTJIS_CP932
5680 #ifdef UTF8_OUTPUT_ENABLE
5681 ms_ucs_map_f = UCS_MAP_CP932;
5685 if (strcmp(long_option[i].name, "no-cp932") == 0){
5686 #ifdef SHIFTJIS_CP932
5690 #ifdef UTF8_OUTPUT_ENABLE
5691 ms_ucs_map_f = UCS_MAP_ASCII;
5695 #ifdef SHIFTJIS_CP932
5696 if (strcmp(long_option[i].name, "cp932inv") == 0){
5703 if (strcmp(long_option[i].name, "x0212") == 0){
5710 if (strcmp(long_option[i].name, "exec-in") == 0){
5714 if (strcmp(long_option[i].name, "exec-out") == 0){
5719 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5720 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5721 no_cp932ext_f = TRUE;
5724 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5725 no_best_fit_chars_f = TRUE;
/* --fb-*: choose what encode_fallback points at */
5728 if (strcmp(long_option[i].name, "fb-skip") == 0){
5729 encode_fallback = NULL;
5732 if (strcmp(long_option[i].name, "fb-html") == 0){
5733 encode_fallback = encode_fallback_html;
5736 if (strcmp(long_option[i].name, "fb-xml") == 0){
5737 encode_fallback = encode_fallback_xml;
5740 if (strcmp(long_option[i].name, "fb-java") == 0){
5741 encode_fallback = encode_fallback_java;
5744 if (strcmp(long_option[i].name, "fb-perl") == 0){
5745 encode_fallback = encode_fallback_perl;
5748 if (strcmp(long_option[i].name, "fb-subchar") == 0){
5749 encode_fallback = encode_fallback_subchar;
/* --fb-subchar=NUM: NUM is read as decimal, hexadecimal or octal */
5752 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
5753 encode_fallback = encode_fallback_subchar;
5754 unicode_subchar = 0;
5756 /* decimal number */
/* from here on i is a digit index and no longer the long_option index */
5757 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
5758 unicode_subchar *= 10;
5759 unicode_subchar += hex2bin(p[i]);
5761 }else if(p[1] == 'x' || p[1] == 'X'){
5762 /* hexadecimal number */
5763 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
5764 unicode_subchar <<= 4;
5765 unicode_subchar |= hex2bin(p[i]);
5769 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
5770 unicode_subchar *= 8;
5771 unicode_subchar += hex2bin(p[i]);
/* w16e_conv() fills i and j, which are then packed back into unicode_subchar */
5774 w16e_conv(unicode_subchar, &i, &j);
5775 unicode_subchar = i<<8 | j;
5779 #ifdef UTF8_OUTPUT_ENABLE
5780 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
5781 ms_ucs_map_f = UCS_MAP_MS;
5785 #ifdef UNICODE_NORMALIZATION
5786 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/* --prefix=XYZ...: every graphic character after the first maps to the first one in prefix_table */
5791 if (strcmp(long_option[i].name, "prefix=") == 0){
5792 if (nkf_isgraph(p[0])){
5793 for (i = 1; nkf_isgraph(p[i]); i++){
5794 prefix_table[p[i]] = p[0];
5799 #if !defined(PERL_XS) && !defined(WIN32DLL)
5800 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
5805 case 'b': /* buffered mode */
5808 case 'u': /* unbuffered mode */
5811 case 't': /* transparent mode */
5816 } else if (*cp=='2') {
5820 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
5828 case 'j': /* JIS output */
5830 output_encoding = nkf_enc_from_index(ISO_2022_JP);
5832 case 'e': /* AT&T EUC output */
5833 output_encoding = nkf_enc_from_index(EUCJP_NKF);
5835 case 's': /* SJIS output */
5836 output_encoding = nkf_enc_from_index(WINDOWS_31J);
5838 case 'l': /* ISO8859 Latin-1 support, no conversion */
5839 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
5840 input_encoding = nkf_enc_from_index(ISO_8859_1);
5842 case 'i': /* Kanji IN ESC-$-@/B */
5843 if (*cp=='@'||*cp=='B')
5844 kanji_intro = *cp++;
5846 case 'o': /* ASCII IN ESC-(-J/B */
5847 if (*cp=='J'||*cp=='B'||*cp=='H')
5848 ascii_intro = *cp++;
5852 bit:1 katakana->hiragana
5853 bit:2 hiragana->katakana
/* the digit value itself is OR-ed into hira_f */
5855 if ('9'>= *cp && *cp>='0')
5856 hira_f |= (*cp++ -'0');
5863 #if defined(MSDOS) || defined(__OS2__)
5870 show_configuration();
5878 #ifdef UTF8_OUTPUT_ENABLE
5879 case 'w': /* UTF-8 output */
5884 output_encoding = nkf_enc_from_index(UTF_8N);
5886 output_bom_f = TRUE;
5887 output_encoding = nkf_enc_from_index(UTF_8_BOM);
5891 if ('1'== cp[0] && '6'==cp[1]) {
5894 } else if ('3'== cp[0] && '2'==cp[1]) {
5898 output_encoding = nkf_enc_from_index(UTF_8);
5903 output_endian = ENDIAN_LITTLE;
5904 } else if (cp[0] == 'B') {
5907 output_encoding = nkf_enc_from_index(enc_idx);
/* map the generic UTF-16 / UTF-32 index to the endian-specific one, without and with BOM */
5912 enc_idx = enc_idx == UTF_16
5913 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5914 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5916 output_bom_f = TRUE;
5917 enc_idx = enc_idx == UTF_16
5918 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
5919 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
5921 output_encoding = nkf_enc_from_index(enc_idx);
5925 #ifdef UTF8_INPUT_ENABLE
5926 case 'W': /* UTF input */
5929 input_encoding = nkf_enc_from_index(UTF_8);
5932 if ('1'== cp[0] && '6'==cp[1]) {
5934 input_endian = ENDIAN_BIG;
5936 } else if ('3'== cp[0] && '2'==cp[1]) {
5938 input_endian = ENDIAN_BIG;
5941 input_encoding = nkf_enc_from_index(UTF_8);
5946 input_endian = ENDIAN_LITTLE;
5947 } else if (cp[0] == 'B') {
5949 input_endian = ENDIAN_BIG;
/* NOTE(review): this is the input-side (-W) branch and the lines just above
 * set input_endian, yet the selection below tests output_endian.  Looks
 * like a copy/paste slip from the -w branch; confirm and switch to
 * input_endian. */
5951 enc_idx = enc_idx == UTF_16
5952 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
5953 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
5954 input_encoding = nkf_enc_from_index(enc_idx);
5958 /* Input code assumption */
5959 case 'J': /* ISO-2022-JP input */
5960 input_encoding = nkf_enc_from_index(ISO_2022_JP);
5962 case 'E': /* EUC-JP input */
5963 input_encoding = nkf_enc_from_index(EUCJP_NKF);
5965 case 'S': /* Windows-31J input */
5966 input_encoding = nkf_enc_from_index(WINDOWS_31J);
5968 case 'Z': /* Convert X0208 alphabet to ASCII */
5970 bit:0 Convert JIS X 0208 Alphabet to ASCII
5971 bit:1 Convert Kankaku to one space
5972 bit:2 Convert Kankaku to two spaces
5973 bit:3 Convert HTML Entity
5974 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
/* each digit N sets bit N of alpha_f; with no digit, bit 0 is set */
5976 while ('0'<= *cp && *cp <='9') {
5977 alpha_f |= 1 << (*cp++ - '0');
5979 if (!alpha_f) alpha_f = 1;
5981 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
5982 x0201_f = FALSE; /* No X0201->X0208 conversion */
5984 ESC-(-I in JIS, EUC, MS Kanji
5985 SI/SO in JIS, EUC, MS Kanji
5986 SS2 in EUC, JIS, not in MS Kanji
5987 MS Kanji (0xa0-0xdf)
5989 ESC-(-I in JIS (0x20-0x5f)
5990 SS2 in EUC (0xa0-0xdf)
5991 0xa0-0xd in MS Kanji (0xa0-0xdf)
5994 case 'X': /* Convert X0201 kana to X0208 */
5997 case 'F': /* preserve new lines */
5998 fold_preserve_f = TRUE;
/* 'F' continues straight into 'f' (no break in between) */
5999 case 'f': /* folding -f60 or -f */
6002 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6004 fold_len += *cp++ - '0';
/* a length outside 1 .. BUFSIZ-1 is replaced by DEFAULT_FOLD */
6006 if (!(0<fold_len && fold_len<BUFSIZ))
6007 fold_len = DEFAULT_FOLD;
6011 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6013 fold_margin += *cp++ - '0';
6017 case 'm': /* MIME support */
6018 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6019 if (*cp=='B'||*cp=='Q') {
6020 mime_decode_mode = *cp++;
6021 mimebuf_f = FIXED_MIME;
6022 } else if (*cp=='N') {
6023 mime_f = TRUE; cp++;
6024 } else if (*cp=='S') {
6025 mime_f = STRICT_MIME; cp++;
6026 } else if (*cp=='0') {
6027 mime_decode_f = FALSE;
6028 mime_f = FALSE; cp++;
6030 mime_f = STRICT_MIME;
6033 case 'M': /* MIME output */
6036 mimeout_f = FIXED_MIME; cp++;
6037 } else if (*cp=='Q') {
6039 mimeout_f = FIXED_MIME; cp++;
6044 case 'B': /* Broken JIS support */
6046 bit:1 allow any x on ESC-(-x or ESC-$-x
6047 bit:2 reset to ascii on NL
/* digit N sets bit N of broken_f */
6049 if ('9'>= *cp && *cp>='0')
6050 broken_f |= 1<<(*cp++ -'0');
6055 case 'O':/* for Output file */
6059 case 'c':/* add cr code */
6062 case 'd':/* delete cr code */
6065 case 'I': /* ISO-2022-JP output */
6068 case 'L': /* line mode */
6069 if (*cp=='u') { /* unix */
6070 eolmode_f = LF; cp++;
6071 } else if (*cp=='m') { /* mac */
6072 eolmode_f = CR; cp++;
6073 } else if (*cp=='w') { /* windows */
6074 eolmode_f = CRLF; cp++;
6075 } else if (*cp=='0') { /* no conversion */
6076 eolmode_f = 0; cp++;
6081 if ('2' <= *cp && *cp <= '9') {
6084 } else if (*cp == '0' || *cp == '1') {
6093 /* multiple options in one string are allowed for the Perl module */
6094 while(*cp && *cp++!='-');
6097 #if !defined(PERL_XS) && !defined(WIN32DLL)
6098 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6100 /* bogus option but ignored */
6108 #include "nkf32dll.c"
6109 #elif defined(PERL_XS)
6110 #else /* WIN32DLL */
/*
 * Command-line entry point (built when neither WIN32DLL nor PERL_XS is
 * defined).
 *
 * Leading arguments that start with '-' are taken as option strings.  With
 * no file arguments stdin is converted to stdout.  Otherwise each named
 * file is opened and converted in turn; when file_out_f is set, stdout is
 * redirected for the duration of the conversion, either to a temporary
 * file next to the original (which then replaces the original, optionally
 * after the original has been renamed to a backup name) or to "nkf.out".
 * A file that cannot be opened sets is_argument_error.
 */
6111 int main(int argc, char **argv)
6116 char *outfname = NULL;
6119 #ifdef EASYWIN /*Easy Win */
6120 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6122 #ifdef DEFAULT_CODE_LOCALE
6123 setlocale(LC_CTYPE, "");
/* consume every leading argument that begins with '-' */
6125 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6126 cp = (unsigned char *)*argv;
/* child process set-up: a pipe, a fork, and execvp() of the remaining arguments */
6131 if (pipe(fds) < 0 || (pid = fork()) < 0){
6142 execvp(argv[1], &argv[1]);
/* selected flags are copied to *_back locals and copied back afterwards
 * (presumably around a re-initialisation - confirm) */
6159 int debug_f_back = debug_f;
6162 int exec_f_back = exec_f;
6165 int x0212_f_back = x0212_f;
6167 int x0213_f_back = x0213_f;
6168 int guess_f_back = guess_f;
6170 guess_f = guess_f_back;
6173 debug_f = debug_f_back;
6176 exec_f = exec_f_back;
6178 x0212_f = x0212_f_back;
6179 x0213_f = x0213_f_back;
/* binary mode and buffering for stdout */
6182 if (binmode_f == TRUE)
6183 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6184 if (freopen("","wb",stdout) == NULL)
6191 setbuf(stdout, (char *) NULL);
6193 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
/* binary mode and buffering for stdin */
6196 if (binmode_f == TRUE)
6197 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6198 if (freopen("","rb",stdin) == NULL) return (-1);
6202 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
/* stdin to stdout conversion */
6206 kanji_convert(stdin);
6207 if (guess_f) print_guessed_code(NULL);
/* conversion of the files named on the command line */
6211 int is_argument_error = FALSE;
6213 input_codename = NULL;
6216 iconv_for_check = 0;
6218 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6220 is_argument_error = TRUE;
6228 /* reopen file for stdout */
6229 if (file_out_f == TRUE) {
/* room for the original name plus the temporary-file suffix */
6232 outfname = malloc(strlen(origfname)
6233 + strlen(".nkftmpXXXXXX")
6239 strcpy(outfname, origfname);
/* scan backwards for the last directory separator in outfname */
6243 for (i = strlen(outfname); i; --i){
6244 if (outfname[i - 1] == '/'
6245 || outfname[i - 1] == '\\'){
6251 strcat(outfname, "ntXXXXXX");
6253 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6254 S_IREAD | S_IWRITE);
/* alternative: append the suffix to the full name and let mkstemp() create the file */
6256 strcat(outfname, ".nkftmpXXXXXX");
6257 fd = mkstemp(outfname);
/* keep a duplicate of the real stdout, then point stdout at the temporary file */
6260 || (fd_backup = dup(fileno(stdout))) < 0
6261 || dup2(fd, fileno(stdout)) < 0
6272 outfname = "nkf.out";
6275 if(freopen(outfname, "w", stdout) == NULL) {
6279 if (binmode_f == TRUE) {
6280 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6281 if (freopen("","wb",stdout) == NULL)
6288 if (binmode_f == TRUE)
6289 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6290 if (freopen("","rb",fin) == NULL)
6295 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6299 char *filename = NULL;
/* the file name is passed to print_guessed_code() only when several files are processed */
6301 if (nfiles > 1) filename = origfname;
6302 if (guess_f) print_guessed_code(filename);
6308 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* put the saved descriptor back on stdout */
6316 if (dup2(fd_backup, fileno(stdout)) < 0){
6319 if (stat(origfname, &sb)) {
6320 fprintf(stderr, "Can't stat %s\n", origfname);
6322 /*
restore the permission bits of the original file
 */
6323 if (chmod(outfname, sb.st_mode)) {
6324 fprintf(stderr, "Can't set permission %s\n", outfname);
6327 /*
restore the time stamp of the original file
 */
6328 if(preserve_time_f){
6329 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6330 tb[0] = tb[1] = sb.st_mtime;
6331 if (utime(outfname, tb)) {
6332 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6335 tb.actime = sb.st_atime;
6336 tb.modtime = sb.st_mtime;
6337 if (utime(outfname, &tb)) {
6338 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* move the original aside under its backup name, replacing any older backup */
6343 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6345 unlink(backup_filename);
6347 if (rename(origfname, backup_filename)) {
6348 perror(backup_filename);
6349 fprintf(stderr, "Can't rename %s to %s\n",
6350 origfname, backup_filename);
6354 if (unlink(origfname)){
/* the converted temporary file takes the place of the original */
6359 if (rename(outfname, origfname)) {
6361 fprintf(stderr, "Can't rename %s to %s\n",
6362 outfname, origfname);
6369 if (is_argument_error)
6372 #ifdef EASYWIN /*Easy Win */
6373 if (file_out_f == FALSE)
6374 scanf("%d",&end_check);
6377 #else /* for Other OS */
6378 if (file_out_f == TRUE)
6380 #endif /*Easy Win */
6383 #endif /* WIN32DLL */