2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2009, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.0.9"
24 #define NKF_RELEASE_DATE "2009-01-20"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2009, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
213 {"ISO-2022-JP", ISO_2022_JP},
214 {"ISO2022JP-CP932", CP50220},
215 {"CP50220", CP50220},
216 {"CP50221", CP50221},
217 {"CSISO2022JP", CP50221},
218 {"CP50222", CP50222},
219 {"ISO-2022-JP-1", ISO_2022_JP_1},
220 {"ISO-2022-JP-3", ISO_2022_JP_3},
221 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
222 {"SHIFT_JIS", SHIFT_JIS},
224 {"WINDOWS-31J", WINDOWS_31J},
225 {"CSWINDOWS31J", WINDOWS_31J},
226 {"CP932", WINDOWS_31J},
227 {"MS932", WINDOWS_31J},
228 {"CP10001", CP10001},
231 {"EUCJP-NKF", EUCJP_NKF},
232 {"CP51932", CP51932},
233 {"EUC-JP-MS", EUCJP_MS},
234 {"EUCJP-MS", EUCJP_MS},
235 {"EUCJPMS", EUCJP_MS},
236 {"EUC-JP-ASCII", EUCJP_ASCII},
237 {"EUCJP-ASCII", EUCJP_ASCII},
238 {"SHIFT_JISX0213", SHIFT_JISX0213},
239 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
240 {"EUC-JISX0213", EUC_JISX0213},
241 {"EUC-JIS-2004", EUC_JIS_2004},
244 {"UTF-8-BOM", UTF_8_BOM},
245 {"UTF8-MAC", UTF8_MAC},
246 {"UTF-8-MAC", UTF8_MAC},
248 {"UTF-16BE", UTF_16BE},
249 {"UTF-16BE-BOM", UTF_16BE_BOM},
250 {"UTF-16LE", UTF_16LE},
251 {"UTF-16LE-BOM", UTF_16LE_BOM},
253 {"UTF-32BE", UTF_32BE},
254 {"UTF-32BE-BOM", UTF_32BE_BOM},
255 {"UTF-32LE", UTF_32LE},
256 {"UTF-32LE-BOM", UTF_32LE_BOM},
261 #if defined(DEFAULT_CODE_JIS)
262 #define DEFAULT_ENCIDX ISO_2022_JP
263 #elif defined(DEFAULT_CODE_SJIS)
264 #define DEFAULT_ENCIDX SHIFT_JIS
265 #elif defined(DEFAULT_CODE_WINDOWS_31J)
266 #define DEFAULT_ENCIDX WINDOWS_31J
267 #elif defined(DEFAULT_CODE_EUC)
268 #define DEFAULT_ENCIDX EUC_JP
269 #elif defined(DEFAULT_CODE_UTF8)
270 #define DEFAULT_ENCIDX UTF_8
/*
 * ASCII character classification and conversion helpers.
 *
 * Every use of a macro parameter is parenthesized so that compound
 * arguments (e.g. `a | b`, `x ? y : z`) expand as a single operand.
 * Arguments are still evaluated more than once: do not pass expressions
 * with side effects.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c) <= 'F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit character; 0 for anything else. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the second-highest byte of c2 (within 16 bits) equals SS3. */
#define is_eucg3(c2) ((((unsigned short)(c2)) >> 8) == SS3)
/* Characters that are emitted unescaped by the MIME encoder. */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) < (0xE0&0x7F))
300 #define HOLD_SIZE 1024
301 #if defined(INT_IS_SHORT)
302 #define IOBUF_SIZE 2048
304 #define IOBUF_SIZE 16384
307 #define DEFAULT_J 'B'
308 #define DEFAULT_R 'B'
315 /* MIME preprocessor */
317 #ifdef EASYWIN /*Easy Win */
318 extern POINT _BufferSize;
327 void (*status_func)(struct input_code *, nkf_char);
328 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
332 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
333 static nkf_encoding *input_encoding = NULL;
334 static nkf_encoding *output_encoding = NULL;
336 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
338 * 0: Shift_JIS, eucJP-ascii
343 #define UCS_MAP_ASCII 0
345 #define UCS_MAP_CP932 2
346 #define UCS_MAP_CP10001 3
347 static int ms_ucs_map_f = UCS_MAP_ASCII;
349 #ifdef UTF8_INPUT_ENABLE
350 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
351 static int no_cp932ext_f = FALSE;
352 /* ignore ZERO WIDTH NO-BREAK SPACE */
353 static int no_best_fit_chars_f = FALSE;
354 static int input_endian = ENDIAN_BIG;
355 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
356 static void (*encode_fallback)(nkf_char c) = NULL;
357 static void w_status(struct input_code *, nkf_char);
359 #ifdef UTF8_OUTPUT_ENABLE
360 static int output_bom_f = FALSE;
361 static int output_endian = ENDIAN_BIG;
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_getc(FILE *f);
373 static void mime_putc(nkf_char c);
377 #if !defined(PERL_XS) && !defined(WIN32DLL)
378 static unsigned char stdibuf[IOBUF_SIZE];
379 static unsigned char stdobuf[IOBUF_SIZE];
/* Global option flags; each starts at the default shown in its initializer. */
383 static int unbuf_f = FALSE;
384 static int estab_f = FALSE;
385 static int nop_f = FALSE;
386 static int binmode_f = TRUE; /* binary mode */
387 static int rot_f = FALSE; /* rot14/43 mode */
388 static int hira_f = FALSE; /* hiragana/katakana conversion */
389 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
390 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
391 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
392 static int mimebuf_f = FALSE; /* MIME buffered input */
393 static int broken_f = FALSE; /* convert ESC-less broken JIS */
394 static int iso8859_f = FALSE; /* ISO8859 through */
395 static int mimeout_f = FALSE; /* base64 mode */
396 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
397 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
399 #ifdef UNICODE_NORMALIZATION
400 static int nfc_f = FALSE;
401 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
402 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
406 static int cap_f = FALSE;
407 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
408 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
410 static int url_f = FALSE;
411 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
412 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Tag bits carried inside an nkf_char.
 *
 * The top byte (CLASS_MASK) holds a class tag and the low 24 bits
 * (VALUE_MASK) hold the value; CLASS_UNICODE marks a value as a Unicode
 * code point. PREFIX_EUCG3 is OR-ed into a value by nkf_char_euc3_new().
 * Macro parameters are parenthesized so compound arguments stay intact.
 */
#define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
#define CLASS_MASK NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE NKF_INT32_C(0x01000000)
#define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/* True when the class tag of c is CLASS_UNICODE. */
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* True when the value part of c does not exceed U+FFFF. */
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* True when the value part of c does not exceed U+10FFFF. */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
427 #ifdef NUMCHAR_OPTION
428 static int numchar_f = FALSE;
429 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
430 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
434 static int noout_f = FALSE;
435 static void no_putc(nkf_char c);
436 static int debug_f = FALSE;
437 static void debug(const char *str);
438 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
441 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
442 static void set_input_codename(const char *codename);
445 static int exec_f = 0;
448 #ifdef SHIFTJIS_CP932
449 /* invert IBM extended characters to others */
450 static int cp51932_f = FALSE;
452 /* invert NEC-selected IBM extended characters to IBM extended characters */
453 static int cp932inv_f = TRUE;
455 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
456 #endif /* SHIFTJIS_CP932 */
458 static int x0212_f = FALSE;
459 static int x0213_f = FALSE;
461 static unsigned char prefix_table[256];
463 static void e_status(struct input_code *, nkf_char);
464 static void s_status(struct input_code *, nkf_char);
466 struct input_code input_code_list[] = {
467 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
468 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
469 #ifdef UTF8_INPUT_ENABLE
470 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
475 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
476 static int base64_count = 0;
478 /* X0208 -> ASCII converter */
481 static int f_line = 0; /* chars in line */
482 static int f_prev = 0;
483 static int fold_preserve_f = FALSE; /* preserve new lines */
484 static int fold_f = FALSE;
485 static int fold_len = 0;
488 static unsigned char kanji_intro = DEFAULT_J;
489 static unsigned char ascii_intro = DEFAULT_R;
493 #define FOLD_MARGIN 10
494 #define DEFAULT_FOLD 60
496 static int fold_margin = FOLD_MARGIN;
498 /* process default */
501 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
503 fprintf(stderr,"nkf internal module connection failure.\n");
509 no_connection(nkf_char c2, nkf_char c1)
511 no_connection2(c2,c1,0);
514 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
515 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
517 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
518 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
519 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
520 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
521 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
522 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
523 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
525 /* static redirections */
527 static void (*o_putc)(nkf_char c) = std_putc;
529 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
530 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
532 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
533 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
535 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
537 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
538 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
540 /* for strict mime */
541 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
542 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
545 static int output_mode = ASCII; /* output kanji mode */
546 static int input_mode = ASCII; /* input kanji mode */
547 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
549 /* X0201 / X0208 conversion tables */
551 /* X0201 kana conversion table */
553 static const unsigned char cv[]= {
554 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
555 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
556 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
557 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
558 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
559 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
560 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
561 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
562 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
563 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
564 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
565 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
566 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
567 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
568 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
569 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
573 /* X0201 kana conversion table for dakuten */
575 static const unsigned char dv[]= {
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
580 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
581 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
582 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
583 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
584 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
585 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
586 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
587 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
594 /* X0201 kana conversion table for han-dakuten */
596 static const unsigned char ev[]= {
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
608 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 /* X0208 kigou conversion table */
617 /* 0x8140 - 0x819e */
618 static const unsigned char fv[] = {
620 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
621 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
622 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
624 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
625 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
626 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
627 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
628 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
630 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
636 static int option_mode = 0;
637 static int file_out_f = FALSE;
639 static int overwrite_f = FALSE;
640 static int preserve_time_f = FALSE;
641 static int backup_f = FALSE;
642 static char *backup_suffix = "";
645 static int eolmode_f = 0; /* CR, LF, CRLF */
646 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
647 static nkf_char prev_cr = 0; /* CR or 0 */
648 #ifdef EASYWIN /*Easy Win */
649 static int end_check;
/*
 * Allocate `size` bytes, or terminate the process.
 *
 * A zero-byte request is rounded up to one byte so that a successful call
 * never returns NULL. On allocation failure the error is reported with
 * perror() and the process exits with status 1; the function therefore
 * only ever returns a usable pointer.
 */
static void *
nkf_xmalloc(size_t size)
{
    void *ptr;

    if (size == 0) size = 1;

    ptr = malloc(size);
    if (ptr == NULL) {
        perror("can't malloc");
        exit(1);
    }

    return ptr;
}
/*
 * Resize the allocation at `ptr` to `size` bytes, or terminate the process.
 *
 * A zero-byte request is rounded up to one byte. On failure the error is
 * reported with perror() and the process exits with status 1, so the
 * original pointer is never needed again by the caller and overwriting it
 * with the realloc() result is safe here.
 */
static void *
nkf_xrealloc(void *ptr, size_t size)
{
    if (size == 0) size = 1;

    ptr = realloc(ptr, size);
    if (ptr == NULL) {
        perror("can't realloc");
        exit(1);
    }

    return ptr;
}
682 #define nkf_xfree(ptr) free(ptr)
685 nkf_str_caseeql(const char *src, const char *target)
688 for (i = 0; src[i] && target[i]; i++) {
689 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
691 if (src[i] || target[i]) return FALSE;
696 nkf_enc_from_index(int idx)
698 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
701 return &nkf_encoding_table[idx];
705 nkf_enc_find_index(const char *name)
708 if (name[0] == 'X' && *(name+1) == '-') name += 2;
709 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
710 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
711 return encoding_name_to_id_table[i].id;
718 nkf_enc_find(const char *name)
721 idx = nkf_enc_find_index(name);
722 if (idx < 0) return 0;
723 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for an nkf_encoding pointer `enc`.
 * Each macro dereferences `enc`, so `enc` must not be NULL.
 */
/* Field accessors: name, id and base_encoding of the entry. */
726 #define nkf_enc_name(enc) (enc)->name
727 #define nkf_enc_to_index(enc) (enc)->id
728 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* Input / output converter of the entry's base encoding. */
729 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
730 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP. */
731 #define nkf_enc_asciicompat(enc) (\
732 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
733 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is one of the UTF-8 / UTF-16 / UTF-32 families. */
734 #define nkf_enc_unicode_p(enc) (\
735 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
736 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
737 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the entry's id is CP50220, CP50221 or CP50222. */
738 #define nkf_enc_cp5022x_p(enc) (\
739 nkf_enc_to_index(enc) == CP50220 ||\
740 nkf_enc_to_index(enc) == CP50221 ||\
741 nkf_enc_to_index(enc) == CP50222)
743 #ifdef DEFAULT_CODE_LOCALE
747 #ifdef HAVE_LANGINFO_H
748 return nl_langinfo(CODESET);
749 #elif defined(__WIN32__)
751 sprintf(buf, "CP%d", GetACP());
753 #elif defined(__OS2__)
754 # if defined(INT_IS_SHORT)
760 ULONG ulCP[1], ulncp;
761 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
762 if (ulCP[0] == 932 || ulCP[0] == 943)
763 strcpy(buf, "Shift_JIS");
765 sprintf(buf, "CP%lu", ulCP[0]);
773 nkf_locale_encoding()
775 nkf_encoding *enc = 0;
776 const char *encname = nkf_locale_charmap();
778 enc = nkf_enc_find(encname);
781 #endif /* DEFAULT_CODE_LOCALE */
786 return &nkf_encoding_table[UTF_8];
790 nkf_default_encoding()
792 nkf_encoding *enc = 0;
793 #ifdef DEFAULT_CODE_LOCALE
794 enc = nkf_locale_encoding();
795 #elif defined(DEFAULT_ENCIDX)
796 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
798 if (!enc) enc = nkf_utf8_encoding();
/*
 * nkf_buf: a small push/pop buffer.
 * The nkf_buf_t definition is not visible in this part of the file; the
 * code below uses its members ptr (storage), len (elements in use) and
 * capa (compared against len before a push).
 */
/* Create a buffer: the object itself and `length` bytes of storage both come from nkf_xmalloc. */
809 nkf_buf_new(int length)
811 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
812 buf->ptr = nkf_xmalloc(length);
/* Counterpart of nkf_buf_new; presumably releases the storage and the object -- body not visible here, TODO confirm. */
820 nkf_buf_dispose(nkf_buf_t *buf)
/* Number of stored elements, and emptiness test. */
827 #define nkf_buf_length(buf) ((buf)->len)
828 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
/* Return the element stored at position `index`. */
831 nkf_buf_at(nkf_buf_t *buf, int index)
/*
 * NOTE(review): this assertion accepts index == buf->len, which is one past
 * the last stored element (nkf_buf_push writes ptr[len] and then increments
 * len). `index < buf->len` looks like the intended bound -- confirm against
 * callers before tightening.
 */
833 assert(index <= buf->len);
834 return buf->ptr[index];
/* Reset the buffer; presumably sets len to 0 -- body not visible here, TODO confirm. */
838 nkf_buf_clear(nkf_buf_t *buf)
/* Append `c` at ptr[len] and advance len. */
844 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
/* Buffer-full case; how it is handled is not visible in this excerpt. */
846 if (buf->capa <= buf->len) {
849 buf->ptr[buf->len++] = c;
/* Remove and return the most recently pushed element; the buffer must not be empty. */
853 nkf_buf_pop(nkf_buf_t *buf)
855 assert(!nkf_buf_empty_p(buf));
856 return buf->ptr[--buf->len];
859 /* Normalization Form C */
862 #define fprintf dllprintf
868 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
875 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
876 #ifdef UTF8_OUTPUT_ENABLE
877 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
878 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
881 #ifdef UTF8_INPUT_ENABLE
882 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
883 " UTF option is -W[8,[16,32][B,L]]\n"
885 " J/S/E Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
889 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
890 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
891 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
894 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
895 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
896 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
897 " X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
900 " O Output to File (DEFAULT 'nkf.out')\n"
901 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
904 " --ic=<encoding> Specify the input encoding\n"
905 " --oc=<encoding> Specify the output encoding\n"
906 " --hiragana --katakana Hiragana/Katakana Conversion\n"
907 " --katakana-hiragana Converts each other\n"
911 " --{cap, url}-input Convert hex after ':' or '%%'\n"
913 #ifdef NUMCHAR_OPTION
914 " --numchar-input Convert Unicode Character Reference\n"
916 #ifdef UTF8_INPUT_ENABLE
917 " --fb-{skip, html, xml, perl, java, subchar}\n"
918 " Specify unassigned character's replacement\n"
923 " --in-place[=SUF] Overwrite original files\n"
924 " --overwrite[=SUF] Preserve timestamp of original files\n"
926 " -g --guess Guess the input code\n"
927 " -v --version Print the version\n"
928 " --help/-V Print this help / configuration\n"
934 show_configuration(void)
937 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
938 " Compile-time options:\n"
939 " Compiled at: " __DATE__ " " __TIME__ "\n"
942 " Default output encoding: "
943 #ifdef DEFAULT_CODE_LOCALE
944 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
945 #elif defined(DEFAULT_ENCIDX)
946 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
952 " Default output end of line: "
953 #if DEFAULT_NEWLINE == CR
955 #elif DEFAULT_NEWLINE == CRLF
961 " Decode MIME encoded string: "
962 #if MIME_DECODE_DEFAULT
968 " Convert JIS X 0201 Katakana: "
975 " --help, --version output: "
976 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build the backup file name for `filename` from `suffix`.
 *
 * - If `suffix` contains no '*', the result is `filename` followed by
 *   `suffix`.
 * - Otherwise the result is `suffix` with every '*' replaced by
 *   `filename`.
 *
 * The result is a NUL-terminated string allocated with nkf_xmalloc();
 * this function does not free it.
 *
 * Lengths are kept in size_t (strlen's own type) and the pieces are
 * copied with memcpy at a tracked offset, so the partially built string
 * is never rescanned.
 */
static char *
get_backup_filename(const char *suffix, const char *filename)
{
    size_t filename_length = strlen(filename);
    size_t suffix_length = strlen(suffix);
    size_t asterisk_count = 0;
    size_t i, j;
    char *backup_filename;

    for (i = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count == 0) {
        backup_filename = nkf_xmalloc(filename_length + suffix_length + 1);
        memcpy(backup_filename, filename, filename_length);
        /* +1 copies the suffix's terminating NUL as well */
        memcpy(backup_filename + filename_length, suffix, suffix_length + 1);
        return backup_filename;
    }

    /* Each '*' (one byte of suffix) expands to filename_length bytes. */
    backup_filename = nkf_xmalloc((suffix_length - asterisk_count)
                                  + asterisk_count * filename_length + 1);
    for (i = 0, j = 0; i < suffix_length; i++) {
        if (suffix[i] == '*') {
            memcpy(backup_filename + j, filename, filename_length);
            j += filename_length;
        } else {
            backup_filename[j++] = suffix[i];
        }
    }
    backup_filename[j] = '\0';
    return backup_filename;
}
1022 #ifdef UTF8_INPUT_ENABLE
1024 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1031 (*f)(0, bin2hex(c>>shift));
1042 encode_fallback_html(nkf_char c)
1047 if(c >= NKF_INT32_C(1000000))
1048 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1049 if(c >= NKF_INT32_C(100000))
1050 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1052 (*oconv)(0, 0x30+(c/10000 )%10);
1054 (*oconv)(0, 0x30+(c/1000 )%10);
1056 (*oconv)(0, 0x30+(c/100 )%10);
1058 (*oconv)(0, 0x30+(c/10 )%10);
1060 (*oconv)(0, 0x30+ c %10);
1066 encode_fallback_xml(nkf_char c)
1071 nkf_each_char_to_hex(oconv, c);
1077 encode_fallback_java(nkf_char c)
1081 if(!nkf_char_unicode_bmp_p(c)){
1085 (*oconv)(0, bin2hex(c>>20));
1086 (*oconv)(0, bin2hex(c>>16));
1090 (*oconv)(0, bin2hex(c>>12));
1091 (*oconv)(0, bin2hex(c>> 8));
1092 (*oconv)(0, bin2hex(c>> 4));
1093 (*oconv)(0, bin2hex(c ));
1098 encode_fallback_perl(nkf_char c)
1103 nkf_each_char_to_hex(oconv, c);
1109 encode_fallback_subchar(nkf_char c)
1111 c = unicode_subchar;
1112 (*oconv)((c>>8)&0xFF, c&0xFF);
1117 static const struct {
1141 {"katakana-hiragana","h3"},
1149 #ifdef UTF8_OUTPUT_ENABLE
1159 {"fb-subchar=", ""},
1161 #ifdef UTF8_INPUT_ENABLE
1162 {"utf8-input", "W"},
1163 {"utf16-input", "W16"},
1164 {"no-cp932ext", ""},
1165 {"no-best-fit-chars",""},
1167 #ifdef UNICODE_NORMALIZATION
1168 {"utf8mac-input", ""},
1180 #ifdef NUMCHAR_OPTION
1181 {"numchar-input", ""},
1187 #ifdef SHIFTJIS_CP932
1198 set_input_encoding(nkf_encoding *enc)
1200 switch (nkf_enc_to_index(enc)) {
1207 #ifdef SHIFTJIS_CP932
1210 #ifdef UTF8_OUTPUT_ENABLE
1211 ms_ucs_map_f = UCS_MAP_CP932;
1221 case ISO_2022_JP_2004:
1228 #ifdef SHIFTJIS_CP932
1231 #ifdef UTF8_OUTPUT_ENABLE
1232 ms_ucs_map_f = UCS_MAP_CP932;
1237 #ifdef SHIFTJIS_CP932
1240 #ifdef UTF8_OUTPUT_ENABLE
1241 ms_ucs_map_f = UCS_MAP_CP10001;
1249 #ifdef SHIFTJIS_CP932
1252 #ifdef UTF8_OUTPUT_ENABLE
1253 ms_ucs_map_f = UCS_MAP_CP932;
1257 #ifdef SHIFTJIS_CP932
1260 #ifdef UTF8_OUTPUT_ENABLE
1261 ms_ucs_map_f = UCS_MAP_MS;
1265 #ifdef SHIFTJIS_CP932
1268 #ifdef UTF8_OUTPUT_ENABLE
1269 ms_ucs_map_f = UCS_MAP_ASCII;
1272 case SHIFT_JISX0213:
1273 case SHIFT_JIS_2004:
1275 #ifdef SHIFTJIS_CP932
1282 #ifdef SHIFTJIS_CP932
1286 #ifdef UTF8_INPUT_ENABLE
1287 #ifdef UNICODE_NORMALIZATION
1295 input_endian = ENDIAN_BIG;
1299 input_endian = ENDIAN_LITTLE;
1304 input_endian = ENDIAN_BIG;
1308 input_endian = ENDIAN_LITTLE;
1315 set_output_encoding(nkf_encoding *enc)
1317 switch (nkf_enc_to_index(enc)) {
1320 #ifdef SHIFTJIS_CP932
1321 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1323 #ifdef UTF8_OUTPUT_ENABLE
1324 ms_ucs_map_f = UCS_MAP_CP932;
1328 #ifdef SHIFTJIS_CP932
1329 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1331 #ifdef UTF8_OUTPUT_ENABLE
1332 ms_ucs_map_f = UCS_MAP_CP932;
1337 #ifdef SHIFTJIS_CP932
1338 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1344 #ifdef SHIFTJIS_CP932
1345 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1351 #ifdef UTF8_OUTPUT_ENABLE
1352 ms_ucs_map_f = UCS_MAP_CP932;
1356 #ifdef UTF8_OUTPUT_ENABLE
1357 ms_ucs_map_f = UCS_MAP_CP10001;
1362 #ifdef SHIFTJIS_CP932
1363 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1365 #ifdef UTF8_OUTPUT_ENABLE
1366 ms_ucs_map_f = UCS_MAP_ASCII;
1371 #ifdef SHIFTJIS_CP932
1372 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1374 #ifdef UTF8_OUTPUT_ENABLE
1375 ms_ucs_map_f = UCS_MAP_ASCII;
1379 #ifdef SHIFTJIS_CP932
1380 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1382 #ifdef UTF8_OUTPUT_ENABLE
1383 ms_ucs_map_f = UCS_MAP_CP932;
1388 #ifdef UTF8_OUTPUT_ENABLE
1389 ms_ucs_map_f = UCS_MAP_MS;
1394 #ifdef UTF8_OUTPUT_ENABLE
1395 ms_ucs_map_f = UCS_MAP_ASCII;
1398 case SHIFT_JISX0213:
1399 case SHIFT_JIS_2004:
1401 #ifdef SHIFTJIS_CP932
1402 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1409 #ifdef SHIFTJIS_CP932
1410 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1413 #ifdef UTF8_OUTPUT_ENABLE
1415 output_bom_f = TRUE;
1419 output_bom_f = TRUE;
1422 output_endian = ENDIAN_LITTLE;
1423 output_bom_f = FALSE;
1426 output_endian = ENDIAN_LITTLE;
1427 output_bom_f = TRUE;
1430 output_bom_f = TRUE;
1433 output_endian = ENDIAN_LITTLE;
1434 output_bom_f = FALSE;
1437 output_endian = ENDIAN_LITTLE;
1438 output_bom_f = TRUE;
1444 static struct input_code*
1445 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1448 struct input_code *p = input_code_list;
1450 if (iconv_func == p->iconv_func){
1460 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1462 #ifdef INPUT_CODE_FIX
1463 if (f || !input_encoding)
1470 #ifdef INPUT_CODE_FIX
1471 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1477 if (estab_f && iconv_for_check != iconv){
1478 struct input_code *p = find_inputcode_byfunc(iconv);
1480 set_input_codename(p->name);
1483 iconv_for_check = iconv;
1490 x0212_shift(nkf_char c)
1495 if (0x75 <= c && c <= 0x7f){
1496 ret = c + (0x109 - 0x75);
1499 if (0x75 <= c && c <= 0x7f){
1500 ret = c + (0x113 - 0x75);
1508 x0212_unshift(nkf_char c)
1511 if (0x7f <= c && c <= 0x88){
1512 ret = c + (0x75 - 0x7f);
1513 }else if (0x89 <= c && c <= 0x92){
1514 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1518 #endif /* X0212_ENABLE */
1521 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1527 if((0x21 <= ndx && ndx <= 0x2F)){
1528 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1529 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1531 }else if(0x6E <= ndx && ndx <= 0x7E){
1532 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1533 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1539 else if(nkf_isgraph(ndx)){
1541 const unsigned short *ptr;
1542 ptr = x0212_shiftjis[ndx - 0x21];
1544 val = ptr[(c1 & 0x7f) - 0x21];
1553 c2 = x0212_shift(c2);
1555 #endif /* X0212_ENABLE */
1557 if(0x7F < c2) return 1;
1558 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1559 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1564 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1566 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1569 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1570 if (0xFC < c1) return 1;
1571 #ifdef SHIFTJIS_CP932
1572 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1573 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1580 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1581 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1587 #endif /* SHIFTJIS_CP932 */
1589 if (!x0213_f && is_ibmext_in_sjis(c2)){
1590 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1593 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1606 if(x0213_f && c2 >= 0xF0){
1607 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1608 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1609 }else{ /* 78<=k<=94 */
1610 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1611 if (0x9E < c1) c2++;
1614 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1615 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1616 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1617 if (0x9E < c1) c2++;
1620 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1627 c2 = x0212_unshift(c2);
1634 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
1636 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1644 }else if (val < 0x800){
1645 *p1 = 0xc0 | (val >> 6);
1646 *p2 = 0x80 | (val & 0x3f);
1649 } else if (nkf_char_unicode_bmp_p(val)) {
1650 *p1 = 0xe0 | (val >> 12);
1651 *p2 = 0x80 | ((val >> 6) & 0x3f);
1652 *p3 = 0x80 | ( val & 0x3f);
1654 } else if (nkf_char_unicode_value_p(val)) {
1655 *p1 = 0xe0 | (val >> 16);
1656 *p2 = 0x80 | ((val >> 12) & 0x3f);
1657 *p3 = 0x80 | ((val >> 6) & 0x3f);
1658 *p4 = 0x80 | ( val & 0x3f);
1668 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
1675 else if (c1 <= 0xC3) {
1676 /* trail byte or invalid */
1679 else if (c1 <= 0xDF) {
1681 wc = (c1 & 0x1F) << 6;
1684 else if (c1 <= 0xEF) {
1686 wc = (c1 & 0x0F) << 12;
1687 wc |= (c2 & 0x3F) << 6;
1690 else if (c2 <= 0xF4) {
1692 wc = (c1 & 0x0F) << 18;
1693 wc |= (c2 & 0x3F) << 12;
1694 wc |= (c3 & 0x3F) << 6;
1704 #ifdef UTF8_INPUT_ENABLE
1706 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1707 const unsigned short *const *pp, nkf_char psize,
1708 nkf_char *p2, nkf_char *p1)
1711 const unsigned short *p;
1714 if (pp == 0) return 1;
1717 if (c1 < 0 || psize <= c1) return 1;
1719 if (p == 0) return 1;
1722 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1724 if (val == 0) return 1;
1725 if (no_cp932ext_f && (
1726 (val>>8) == 0x2D || /* NEC special characters */
1727 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1735 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1743 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1745 const unsigned short *const *pp;
1746 const unsigned short *const *const *ppp;
1747 static const char no_best_fit_chars_table_C2[] =
1748 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1749 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1750 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1751 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1752 static const char no_best_fit_chars_table_C2_ms[] =
1753 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1754 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1755 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1756 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1757 static const char no_best_fit_chars_table_932_C2[] =
1758 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1759 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1760 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1761 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1762 static const char no_best_fit_chars_table_932_C3[] =
1763 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1764 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1765 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1766 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1772 }else if(c2 < 0xe0){
1773 if(no_best_fit_chars_f){
1774 if(ms_ucs_map_f == UCS_MAP_CP932){
1777 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1780 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1783 }else if(!cp932inv_f){
1786 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1789 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1792 }else if(ms_ucs_map_f == UCS_MAP_MS){
1793 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1794 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1812 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1813 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1814 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1816 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1817 }else if(c0 < 0xF0){
1818 if(no_best_fit_chars_f){
1819 if(ms_ucs_map_f == UCS_MAP_CP932){
1820 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1821 }else if(ms_ucs_map_f == UCS_MAP_MS){
1826 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1829 if(c0 == 0x92) return 1;
1834 if(c1 == 0x80 || c0 == 0x9C) return 1;
1837 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1842 if(c0 == 0x94) return 1;
1845 if(c0 == 0xBB) return 1;
1855 if(c0 == 0x95) return 1;
1858 if(c0 == 0xA5) return 1;
1865 if(c0 == 0x8D) return 1;
1868 if(c0 == 0x9E && !cp932inv_f) return 1;
1871 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1879 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1880 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1881 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1883 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1885 #ifdef SHIFTJIS_CP932
1886 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1888 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1889 s2e_conv(s2, s1, p2, p1);
1898 #ifdef UTF8_OUTPUT_ENABLE
1900 e2w_conv(nkf_char c2, nkf_char c1)
1902 const unsigned short *p;
1904 if (c2 == JIS_X_0201_1976_K) {
1905 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1913 p = euc_to_utf8_1byte;
1915 } else if (is_eucg3(c2)){
1916 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1919 c2 = (c2&0x7f) - 0x21;
1920 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1921 p = x0212_to_utf8_2bytes[c2];
1927 c2 = (c2&0x7f) - 0x21;
1928 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1930 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1931 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1932 euc_to_utf8_2bytes_ms[c2];
1937 c1 = (c1 & 0x7f) - 0x21;
1938 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
1945 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1952 }else if (0xc0 <= c2 && c2 <= 0xef) {
1953 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1954 #ifdef NUMCHAR_OPTION
1957 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1965 #ifdef UTF8_INPUT_ENABLE
1967 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1969 nkf_char c1, c2, c3, c4;
1976 else if (nkf_char_unicode_bmp_p(val)){
1977 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1978 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1981 *p1 = nkf_char_unicode_new(val);
1987 *p1 = nkf_char_unicode_new(val);
1994 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1996 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1997 if (iso2022jp_f && !x0201_f) {
1998 c2 = GETA1; c1 = GETA2;
2000 c2 = JIS_X_0201_1976_K;
2004 }else if (c2 == 0x8f){
2008 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2009 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2010 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2013 c2 = (c2 << 8) | (c1 & 0x7f);
2015 #ifdef SHIFTJIS_CP932
2018 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2019 s2e_conv(s2, s1, &c2, &c1);
2026 #endif /* SHIFTJIS_CP932 */
2028 #endif /* X0212_ENABLE */
2029 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2032 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2033 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2034 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2039 #ifdef SHIFTJIS_CP932
2040 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2042 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2043 s2e_conv(s2, s1, &c2, &c1);
2050 #endif /* SHIFTJIS_CP932 */
2058 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2060 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2061 if (iso2022jp_f && !x0201_f) {
2062 c2 = GETA1; c1 = GETA2;
2066 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2068 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2070 if(c1 == 0x7F) return 0;
2071 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2074 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2075 if (ret) return ret;
2082 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2084 nkf_char ret = 0, c4 = 0;
2085 static const char w_iconv_utf8_1st_byte[] =
2087 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2088 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2089 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2090 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2097 if (c1 < 0 || 0xff < c1) {
2098 }else if (c1 == 0) { /* 0 : 1 byte*/
2100 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2103 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2105 if (c2 < 0x80 || 0xBF < c2) return 0;
2108 if (c3 == 0) return -1;
2109 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2114 if (c3 == 0) return -1;
2115 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2119 if (c3 == 0) return -1;
2120 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2124 if (c3 == 0) return -2;
2125 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2129 if (c3 == 0) return -2;
2130 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2134 if (c3 == 0) return -2;
2135 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2143 if (c1 == 0 || c1 == EOF){
2144 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2145 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2148 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2156 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * unicode_iconv: route one decoded code point `wc`.  Only part of the body
 * is visible in this listing; the visible branches are:
 *  - (wc>>11) == 27 matches 0xD800..0xDFFF, i.e. a surrogate value that
 *    arrived unpaired, and is rejected with NKF_ICONV_INVALID_CODE_RANGE;
 *  - otherwise values below 0xFFFF are converted by w16e_conv(), and a
 *    non-zero result of that call is returned unchanged;
 *  - otherwise values below 0x10FFFF are wrapped by nkf_char_unicode_new();
 *  - the last visible statement returns NKF_ICONV_INVALID_CODE_RANGE.
 * NOTE(review): both upper bounds are strict, so 0xFFFF takes the
 * nkf_char_unicode_new() path and 0x10FFFF is rejected -- confirm intent.
 */
2158 unicode_iconv(nkf_char wc)
2166 }else if ((wc>>11) == 27) {
2167 /* unpaired surrogate */
2168 return NKF_ICONV_INVALID_CODE_RANGE;
2169 }else if (wc < 0xFFFF) {
2170 ret = w16e_conv(wc, &c2, &c1);
2171 if (ret) return ret;
2172 }else if (wc < 0x10FFFF) {
2174 c1 = nkf_char_unicode_new(wc);
2176 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * Negative return codes of the input converters.  nkf_iconv_utf_16 returns
 * NKF_ICONV_NEED_TWO_MORE_BYTES when a lead surrogate is not yet followed by
 * a trail surrogate; presumably the caller then supplies more input.
 */
2182 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2183 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
/*
 * Combine a UTF-16 surrogate pair into one code point.
 * 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000, so the expression equals
 * 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00).
 */
2184 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * nkf_iconv_utf_16: build a code point from the UTF-16 input bytes c1..c4
 * (in stream order) and pass it to unicode_iconv.  Only part of the body is
 * visible in this listing.
 */
2186 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2195 if (input_endian == ENDIAN_BIG) {
/* big endian: c1 is the high byte of the first unit; 0xD8..0xDB is a lead surrogate */
2196 if (0xD8 <= c1 && c1 <= 0xDB) {
/* the second unit has to be a trail surrogate (high byte 0xDC..0xDF) */
2197 if (0xDC <= c3 && c3 <= 0xDF) {
2198 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
/* no trail surrogate available yet: ask for the two bytes of the second unit */
2199 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* other byte order: the high bytes of the two units are c2 and c4 */
2204 if (0xD8 <= c2 && c2 <= 0xDB) {
2205 if (0xDC <= c4 && c4 <= 0xDF) {
2206 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2207 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* the code point itself is handled by unicode_iconv; its result is returned */
2213 return (*unicode_iconv)(wc);
2217 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2223 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * nkf_iconv_utf_32: build a code point from the four UTF-32 input bytes
 * c1..c4 (in stream order) according to input_endian and pass it to
 * unicode_iconv.  The case labels are not visible in this listing.
 *
 * Each visible assignment combines three of the four bytes; the byte left
 * out (c1, c4, c2 and c3 respectively) is presumably the most significant
 * byte for that byte order and is not checked in the lines shown here.
 */
2229 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2238 switch(input_endian){
/* byte order c1 c2 c3 c4, most significant first */
2240 wc = c2 << 16 | c3 << 8 | c4;
/* byte order reversed: c4 is the most significant byte */
2243 wc = c3 << 16 | c2 << 8 | c1;
/* bytes swapped within each 16-bit half */
2246 wc = c1 << 16 | c4 << 8 | c3;
/* 16-bit halves swapped */
2249 wc = c4 << 16 | c1 << 8 | c2;
2252 return NKF_ICONV_INVALID_CODE_RANGE;
/* the code point itself is handled by unicode_iconv; its result is returned */
2255 return (*unicode_iconv)(wc);
2259 #define output_ascii_escape_sequence(mode) do { \
2260 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2263 (*o_putc)(ascii_intro); \
2264 output_mode = mode; \
2269 output_escape_sequence(int mode)
2271 if (output_mode == mode)
2279 case JIS_X_0201_1976_K:
2287 (*o_putc)(kanji_intro);
2312 j_oconv(nkf_char c2, nkf_char c1)
2314 #ifdef NUMCHAR_OPTION
2315 if (c2 == 0 && nkf_char_unicode_p(c1)){
2316 w16e_conv(c1, &c2, &c1);
2317 if (c2 == 0 && nkf_char_unicode_p(c1)){
2318 c2 = c1 & VALUE_MASK;
2319 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2322 c2 = 0x7F + c1 / 94;
2323 c1 = 0x21 + c1 % 94;
2325 if (encode_fallback) (*encode_fallback)(c1);
2332 output_ascii_escape_sequence(ASCII);
2335 else if (c2 == EOF) {
2336 output_ascii_escape_sequence(ASCII);
2339 else if (c2 == ISO_8859_1) {
2340 output_ascii_escape_sequence(ISO_8859_1);
2343 else if (c2 == JIS_X_0201_1976_K) {
2344 output_escape_sequence(JIS_X_0201_1976_K);
2347 } else if (is_eucg3(c2)){
2348 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2349 (*o_putc)(c2 & 0x7f);
2354 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2355 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2356 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2363 e_oconv(nkf_char c2, nkf_char c1)
2365 if (c2 == 0 && nkf_char_unicode_p(c1)){
2366 w16e_conv(c1, &c2, &c1);
2367 if (c2 == 0 && nkf_char_unicode_p(c1)){
2368 c2 = c1 & VALUE_MASK;
2369 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2373 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2374 c1 = 0x21 + c1 % 94;
2377 (*o_putc)((c2 & 0x7f) | 0x080);
2378 (*o_putc)(c1 | 0x080);
2380 (*o_putc)((c2 & 0x7f) | 0x080);
2381 (*o_putc)(c1 | 0x080);
2385 if (encode_fallback) (*encode_fallback)(c1);
2393 } else if (c2 == 0) {
2394 output_mode = ASCII;
2396 } else if (c2 == JIS_X_0201_1976_K) {
2397 output_mode = EUC_JP;
2398 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2399 } else if (c2 == ISO_8859_1) {
2400 output_mode = ISO_8859_1;
2401 (*o_putc)(c1 | 0x080);
2403 } else if (is_eucg3(c2)){
2404 output_mode = EUC_JP;
2405 #ifdef SHIFTJIS_CP932
2408 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2409 s2e_conv(s2, s1, &c2, &c1);
2414 output_mode = ASCII;
2416 }else if (is_eucg3(c2)){
2419 (*o_putc)((c2 & 0x7f) | 0x080);
2420 (*o_putc)(c1 | 0x080);
2423 (*o_putc)((c2 & 0x7f) | 0x080);
2424 (*o_putc)(c1 | 0x080);
2428 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2429 set_iconv(FALSE, 0);
2430 return; /* too late to rescue this char */
2432 output_mode = EUC_JP;
2433 (*o_putc)(c2 | 0x080);
2434 (*o_putc)(c1 | 0x080);
2439 s_oconv(nkf_char c2, nkf_char c1)
2441 #ifdef NUMCHAR_OPTION
2442 if (c2 == 0 && nkf_char_unicode_p(c1)){
2443 w16e_conv(c1, &c2, &c1);
2444 if (c2 == 0 && nkf_char_unicode_p(c1)){
2445 c2 = c1 & VALUE_MASK;
2446 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2449 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2451 c1 += 0x40 + (c1 > 0x3e);
2456 if(encode_fallback)(*encode_fallback)(c1);
2465 } else if (c2 == 0) {
2466 output_mode = ASCII;
2468 } else if (c2 == JIS_X_0201_1976_K) {
2469 output_mode = SHIFT_JIS;
2471 } else if (c2 == ISO_8859_1) {
2472 output_mode = ISO_8859_1;
2473 (*o_putc)(c1 | 0x080);
2475 } else if (is_eucg3(c2)){
2476 output_mode = SHIFT_JIS;
2477 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2483 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2484 set_iconv(FALSE, 0);
2485 return; /* too late to rescue this char */
2487 output_mode = SHIFT_JIS;
2488 e2s_conv(c2, c1, &c2, &c1);
2490 #ifdef SHIFTJIS_CP932
2492 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2493 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2499 #endif /* SHIFTJIS_CP932 */
2502 if (prefix_table[(unsigned char)c1]){
2503 (*o_putc)(prefix_table[(unsigned char)c1]);
2509 #ifdef UTF8_OUTPUT_ENABLE
2511 w_oconv(nkf_char c2, nkf_char c1)
2517 output_bom_f = FALSE;
2528 if (c2 == 0 && nkf_char_unicode_p(c1)){
2529 val = c1 & VALUE_MASK;
2530 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2532 if (c2) (*o_putc)(c2);
2533 if (c3) (*o_putc)(c3);
2534 if (c4) (*o_putc)(c4);
2541 val = e2w_conv(c2, c1);
2543 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2545 if (c2) (*o_putc)(c2);
2546 if (c3) (*o_putc)(c3);
2547 if (c4) (*o_putc)(c4);
2553 w_oconv16(nkf_char c2, nkf_char c1)
2556 output_bom_f = FALSE;
2557 if (output_endian == ENDIAN_LITTLE){
2571 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2572 if (nkf_char_unicode_bmp_p(c1)) {
2573 c2 = (c1 >> 8) & 0xff;
2577 if (c1 <= UNICODE_MAX) {
2578 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2579 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2580 if (output_endian == ENDIAN_LITTLE){
2581 (*o_putc)(c2 & 0xff);
2582 (*o_putc)((c2 >> 8) & 0xff);
2583 (*o_putc)(c1 & 0xff);
2584 (*o_putc)((c1 >> 8) & 0xff);
2586 (*o_putc)((c2 >> 8) & 0xff);
2587 (*o_putc)(c2 & 0xff);
2588 (*o_putc)((c1 >> 8) & 0xff);
2589 (*o_putc)(c1 & 0xff);
2595 nkf_char val = e2w_conv(c2, c1);
2596 c2 = (val >> 8) & 0xff;
2601 if (output_endian == ENDIAN_LITTLE){
2611 w_oconv32(nkf_char c2, nkf_char c1)
2614 output_bom_f = FALSE;
2615 if (output_endian == ENDIAN_LITTLE){
2633 if (c2 == ISO_8859_1) {
2635 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2638 c1 = e2w_conv(c2, c1);
2641 if (output_endian == ENDIAN_LITTLE){
2642 (*o_putc)( c1 & 0xFF);
2643 (*o_putc)((c1 >> 8) & 0xFF);
2644 (*o_putc)((c1 >> 16) & 0xFF);
2648 (*o_putc)((c1 >> 16) & 0xFF);
2649 (*o_putc)((c1 >> 8) & 0xFF);
2650 (*o_putc)( c1 & 0xFF);
/*
 * Penalty flags for input-encoding detection.  Each macro is the previous
 * one shifted left by one, so every flag is a single bit and a more serious
 * finding has a larger value.  set_code_score() ORs them into
 * input_code.score, and h_conv prefers the candidate with the smaller score.
 */
2655 #define SCORE_L2 (1) /* Kanji Level 2 */
2656 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2657 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2658 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2659 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2660 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */
2661 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2662 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
/* score assigned to a candidate by status_reset() */
2664 #define SCORE_INIT (SCORE_iMIME)
2666 static const nkf_char score_table_A0[] = {
2669 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2670 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2673 static const nkf_char score_table_F0[] = {
2674 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2675 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2676 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2677 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score: add the penalty flag(s) `score` to the candidate `ptr`.
 * (Only part of the body is visible in this listing.)
 */
2681 set_code_score(struct input_code *ptr, nkf_char score)
2684 ptr->score |= score;
/*
 * clr_code_score: remove the penalty flag(s) `score` from the candidate
 * `ptr`.  (Only part of the body is visible in this listing.)
 */
2689 clr_code_score(struct input_code *ptr, nkf_char score)
2692 ptr->score &= ~score;
/*
 * code_score: add a penalty to candidate `ptr` according to the character
 * held in ptr->buf[0] (c2) and ptr->buf[1] (c1).  The first condition of the
 * chain and the closing lines are not visible in this listing.
 */
2697 code_score(struct input_code *ptr)
2699 nkf_char c2 = ptr->buf[0];
/* c1 is only needed for the e2w_conv() lookup below */
2700 #ifdef UTF8_OUTPUT_ENABLE
2701 nkf_char c1 = ptr->buf[1];
2704 set_code_score(ptr, SCORE_ERROR);
2705 }else if (c2 == SS2){
2706 set_code_score(ptr, SCORE_KANA);
2707 }else if (c2 == 0x8f){
2708 set_code_score(ptr, SCORE_X0212);
2709 #ifdef UTF8_OUTPUT_ENABLE
/* a zero result from e2w_conv() is scored as an undefined character */
2710 }else if (!e2w_conv(c2, c1)){
2711 set_code_score(ptr, SCORE_NO_EXIST);
/* c2 in 0x20..0x2f: the penalty comes from score_table_A0, indexed by the low nibble */
2713 }else if ((c2 & 0x70) == 0x20){
2714 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
/* c2 in 0x70..0x7f: the penalty comes from score_table_F0, indexed by the low nibble */
2715 }else if ((c2 & 0x70) == 0x70){
2716 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
/* 0x70 was handled above, so this matches only 0x50..0x6f */
2717 }else if ((c2 & 0x70) >= 0x50){
2718 set_code_score(ptr, SCORE_L2);
2723 status_disable(struct input_code *ptr)
2728 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2732 status_push_ch(struct input_code *ptr, nkf_char c)
2734 ptr->buf[ptr->index++] = c;
2738 status_clear(struct input_code *ptr)
2745 status_reset(struct input_code *ptr)
2748 ptr->score = SCORE_INIT;
2752 status_reinit(struct input_code *ptr)
2755 ptr->_file_stat = 0;
2759 status_check(struct input_code *ptr, nkf_char c)
2761 if (c <= DEL && estab_f){
2767 s_status(struct input_code *ptr, nkf_char c)
2771 status_check(ptr, c);
2776 }else if (nkf_char_unicode_p(c)){
2778 }else if (0xa1 <= c && c <= 0xdf){
2779 status_push_ch(ptr, SS2);
2780 status_push_ch(ptr, c);
2783 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2785 status_push_ch(ptr, c);
2786 }else if (0xed <= c && c <= 0xee){
2788 status_push_ch(ptr, c);
2789 #ifdef SHIFTJIS_CP932
2790 }else if (is_ibmext_in_sjis(c)){
2792 status_push_ch(ptr, c);
2793 #endif /* SHIFTJIS_CP932 */
2795 }else if (0xf0 <= c && c <= 0xfc){
2797 status_push_ch(ptr, c);
2798 #endif /* X0212_ENABLE */
2800 status_disable(ptr);
2804 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2805 status_push_ch(ptr, c);
2806 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2810 status_disable(ptr);
2814 #ifdef SHIFTJIS_CP932
2815 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2816 status_push_ch(ptr, c);
2817 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2818 set_code_score(ptr, SCORE_CP932);
2823 #endif /* SHIFTJIS_CP932 */
2824 status_disable(ptr);
2827 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2828 status_push_ch(ptr, c);
2829 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2830 set_code_score(ptr, SCORE_CP932);
2833 status_disable(ptr);
2840 e_status(struct input_code *ptr, nkf_char c)
2844 status_check(ptr, c);
2849 }else if (nkf_char_unicode_p(c)){
2851 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2853 status_push_ch(ptr, c);
2855 }else if (0x8f == c){
2857 status_push_ch(ptr, c);
2858 #endif /* X0212_ENABLE */
2860 status_disable(ptr);
2864 if (0xa1 <= c && c <= 0xfe){
2865 status_push_ch(ptr, c);
2869 status_disable(ptr);
2874 if (0xa1 <= c && c <= 0xfe){
2876 status_push_ch(ptr, c);
2878 status_disable(ptr);
2880 #endif /* X0212_ENABLE */
2884 #ifdef UTF8_INPUT_ENABLE
2886 w_status(struct input_code *ptr, nkf_char c)
2890 status_check(ptr, c);
2895 }else if (nkf_char_unicode_p(c)){
2897 }else if (0xc0 <= c && c <= 0xdf){
2899 status_push_ch(ptr, c);
2900 }else if (0xe0 <= c && c <= 0xef){
2902 status_push_ch(ptr, c);
2903 }else if (0xf0 <= c && c <= 0xf4){
2905 status_push_ch(ptr, c);
2907 status_disable(ptr);
2912 if (0x80 <= c && c <= 0xbf){
2913 status_push_ch(ptr, c);
2914 if (ptr->index > ptr->stat){
2915 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2916 && ptr->buf[2] == 0xbf);
2917 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2918 &ptr->buf[0], &ptr->buf[1]);
2925 status_disable(ptr);
2929 if (0x80 <= c && c <= 0xbf){
2930 if (ptr->index < ptr->stat){
2931 status_push_ch(ptr, c);
2936 status_disable(ptr);
2944 code_status(nkf_char c)
2946 int action_flag = 1;
2947 struct input_code *result = 0;
2948 struct input_code *p = input_code_list;
2950 if (!p->status_func) {
2954 if (!p->status_func)
2956 (p->status_func)(p, c);
2959 }else if(p->stat == 0){
2970 if (result && !estab_f){
2971 set_iconv(TRUE, result->iconv_func);
2972 }else if (c <= DEL){
2973 struct input_code *ptr = input_code_list;
2983 nkf_buf_t *std_gc_buf;
2984 nkf_char broken_state;
2985 nkf_buf_t *broken_buf;
2986 nkf_char mimeout_state;
2990 static nkf_state_t *nkf_state = NULL;
2992 #define STD_GC_BUFSIZE (256)
2995 nkf_state_init(void)
2998 nkf_buf_clear(nkf_state->std_gc_buf);
2999 nkf_buf_clear(nkf_state->broken_buf);
3000 nkf_buf_clear(nkf_state->nfc_buf);
3003 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3004 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3005 nkf_state->broken_buf = nkf_buf_new(3);
3006 nkf_state->nfc_buf = nkf_buf_new(9);
3008 nkf_state->broken_state = 0;
3009 nkf_state->mimeout_state = 0;
3016 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3017 return nkf_buf_pop(nkf_state->std_gc_buf);
3024 std_ungetc(nkf_char c, FILE *f)
3026 nkf_buf_push(nkf_state->std_gc_buf, c);
3032 std_putc(nkf_char c)
3039 static unsigned char hold_buf[HOLD_SIZE*2];
3040 static int hold_count = 0;
/*
 * push_hold_buf: append c2, truncated to unsigned char, to hold_buf.
 * Returns EOF when the buffer has reached its capacity of HOLD_SIZE*2
 * entries after the push, otherwise the new entry count.  The statement
 * guarded by the first capacity check is not visible in this listing.
 */
3042 push_hold_buf(nkf_char c2)
3044 if (hold_count >= HOLD_SIZE*2)
3046 hold_buf[hold_count++] = (unsigned char)c2;
3047 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3051 h_conv(FILE *f, int c1, int c2)
3057 /** it must NOT be in the kanji shift sequence */
3058 /** it must NOT be written in JIS7 */
3059 /** and it must be after 2 byte 8bit code */
3065 while ((c2 = (*i_getc)(f)) != EOF) {
3071 if (push_hold_buf(c2) == EOF || estab_f) {
3077 struct input_code *p = input_code_list;
3078 struct input_code *result = p;
3083 if (p->status_func && p->score < result->score) {
3088 set_iconv(TRUE, result->iconv_func);
3093 ** 1) EOF is detected, or
3094 ** 2) Code is established, or
3095 ** 3) Buffer is FULL (but last word is pushed)
3097 ** in 1) and 3) cases, we continue to use
3098 ** Kanji codes by oconv and leave estab_f unchanged.
3103 while (hold_index < hold_count){
3104 c1 = hold_buf[hold_index++];
3108 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3109 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3112 if (hold_index < hold_count){
3113 c2 = hold_buf[hold_index++];
3123 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3126 if (hold_index < hold_count){
3127 c3 = hold_buf[hold_index++];
3128 } else if ((c3 = (*i_getc)(f)) == EOF) {
3133 if (hold_index < hold_count){
3134 c4 = hold_buf[hold_index++];
3135 } else if ((c4 = (*i_getc)(f)) == EOF) {
3140 (*iconv)(c1, c2, (c3<<8)|c4);
3145 /* 3 bytes EUC or UTF-8 */
3146 if (hold_index < hold_count){
3147 c3 = hold_buf[hold_index++];
3148 } else if ((c3 = (*i_getc)(f)) == EOF) {
3154 (*iconv)(c1, c2, c3);
3157 if (c3 == EOF) break;
3163 * Check and Ignore BOM
3169 switch(c2 = (*i_getc)(f)){
3171 if((c2 = (*i_getc)(f)) == 0x00){
3172 if((c2 = (*i_getc)(f)) == 0xFE){
3173 if((c2 = (*i_getc)(f)) == 0xFF){
3174 if(!input_encoding){
3175 set_iconv(TRUE, w_iconv32);
3177 if (iconv == w_iconv32) {
3178 input_endian = ENDIAN_BIG;
3181 (*i_ungetc)(0xFF,f);
3182 }else (*i_ungetc)(c2,f);
3183 (*i_ungetc)(0xFE,f);
3184 }else if(c2 == 0xFF){
3185 if((c2 = (*i_getc)(f)) == 0xFE){
3186 if(!input_encoding){
3187 set_iconv(TRUE, w_iconv32);
3189 if (iconv == w_iconv32) {
3190 input_endian = ENDIAN_2143;
3193 (*i_ungetc)(0xFF,f);
3194 }else (*i_ungetc)(c2,f);
3195 (*i_ungetc)(0xFF,f);
3196 }else (*i_ungetc)(c2,f);
3197 (*i_ungetc)(0x00,f);
3198 }else (*i_ungetc)(c2,f);
3199 (*i_ungetc)(0x00,f);
3202 if((c2 = (*i_getc)(f)) == 0xBB){
3203 if((c2 = (*i_getc)(f)) == 0xBF){
3204 if(!input_encoding){
3205 set_iconv(TRUE, w_iconv);
3207 if (iconv == w_iconv) {
3210 (*i_ungetc)(0xBF,f);
3211 }else (*i_ungetc)(c2,f);
3212 (*i_ungetc)(0xBB,f);
3213 }else (*i_ungetc)(c2,f);
3214 (*i_ungetc)(0xEF,f);
3217 if((c2 = (*i_getc)(f)) == 0xFF){
3218 if((c2 = (*i_getc)(f)) == 0x00){
3219 if((c2 = (*i_getc)(f)) == 0x00){
3220 if(!input_encoding){
3221 set_iconv(TRUE, w_iconv32);
3223 if (iconv == w_iconv32) {
3224 input_endian = ENDIAN_3412;
3227 (*i_ungetc)(0x00,f);
3228 }else (*i_ungetc)(c2,f);
3229 (*i_ungetc)(0x00,f);
3230 }else (*i_ungetc)(c2,f);
3231 if(!input_encoding){
3232 set_iconv(TRUE, w_iconv16);
3234 if (iconv == w_iconv16) {
3235 input_endian = ENDIAN_BIG;
3238 (*i_ungetc)(0xFF,f);
3239 }else (*i_ungetc)(c2,f);
3240 (*i_ungetc)(0xFE,f);
3243 if((c2 = (*i_getc)(f)) == 0xFE){
3244 if((c2 = (*i_getc)(f)) == 0x00){
3245 if((c2 = (*i_getc)(f)) == 0x00){
3246 if(!input_encoding){
3247 set_iconv(TRUE, w_iconv32);
3249 if (iconv == w_iconv32) {
3250 input_endian = ENDIAN_LITTLE;
3253 (*i_ungetc)(0x00,f);
3254 }else (*i_ungetc)(c2,f);
3255 (*i_ungetc)(0x00,f);
3256 }else (*i_ungetc)(c2,f);
3257 if(!input_encoding){
3258 set_iconv(TRUE, w_iconv16);
3260 if (iconv == w_iconv16) {
3261 input_endian = ENDIAN_LITTLE;
3264 (*i_ungetc)(0xFE,f);
3265 }else (*i_ungetc)(c2,f);
3266 (*i_ungetc)(0xFF,f);
/*
 * broken_getc: input reader that looks for the two-character sequences
 * "$@" / "$B" and "(J" / "(B" when the previously read character
 * (nkf_state->broken_state) was not ESC.  Presumably these are JIS escape
 * sequences whose ESC byte was lost; what is returned on a match is not
 * visible in this listing -- confirm against the full source.
 */
3275 broken_getc(FILE *f)
/* characters pushed back earlier are delivered before new input is read */
3279 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3280 return nkf_buf_pop(nkf_state->broken_buf);
/* '$' is only examined while input_mode is ASCII or JIS X 0201 kana */
3283 if (c=='$' && nkf_state->broken_state != ESC
3284 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3286 nkf_state->broken_state = 0;
3287 if (c1=='@'|| c1=='B') {
/* c1 is pushed before c; if nkf_buf_pop() returns the newest entry first, later reads yield c and then c1 */
3288 nkf_buf_push(nkf_state->broken_buf, c1);
3289 nkf_buf_push(nkf_state->broken_buf, c);
/* '(' is only examined while input_mode is JIS X 0208 or JIS X 0201 kana */
3295 } else if (c=='(' && nkf_state->broken_state != ESC
3296 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3298 nkf_state->broken_state = 0;
3299 if (c1=='J'|| c1=='B') {
3300 nkf_buf_push(nkf_state->broken_buf, c1);
3301 nkf_buf_push(nkf_state->broken_buf, c);
/* remember the character so the next call can tell whether it follows ESC */
3308 nkf_state->broken_state = c;
/*
 * broken_ungetc: push `c` back so that broken_getc returns it again.
 * The push is skipped when broken_buf already holds two entries.  `f` is
 * not used in the lines visible here.
 */
3314 broken_ungetc(nkf_char c, FILE *f)
3316 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3317 nkf_buf_push(nkf_state->broken_buf, c);
/*
 * eol_conv: output filter for line ends.  It rewrites CR, LF and CR LF
 * according to eolmode_f and, while guess_f is set, records the line-end
 * style of the input in input_eol.  Some lines of the body are not visible
 * in this listing.
 *
 * input_eol is 0 while no line end has been seen, then LF, CRLF or CR, and
 * becomes EOF as soon as two different styles have been seen.
 */
3322 eol_conv(nkf_char c2, nkf_char c1)
3324 if (guess_f && input_eol != EOF) {
3325 if (c2 == 0 && c1 == LF) {
/* an LF directly after a held CR counts as CR LF */
3326 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3327 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3328 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3330 else if (!input_eol) input_eol = CR;
3331 else if (input_eol != CR) input_eol = EOF;
/* a held CR or an incoming LF completes a line end, which is emitted in the requested style */
3333 if (prev_cr || (c2 == 0 && c1 == LF)) {
/* CR is written unless LF-only output is requested, LF unless CR-only output is requested */
3335 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3336 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
/* a CR is held back in prev_cr; an LF is not forwarded; every other character is passed on */
3338 if (c2 == 0 && c1 == CR) prev_cr = CR;
3339 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3343 Return value of fold_conv()
3345 LF add newline and output char
3346 CR add newline and output nothing
3349 1 (or else) normal output
3351 fold state in prev (previous character)
3353 >0x80 Japanese (X0208/X0201)
3358 This fold algorithm does not preserve heading space in a line.
3359 This is the main difference from fmt.
3362 #define char_size(c2,c1) (c2?2:1)
3365 fold_conv(nkf_char c2, nkf_char c1)
3368 nkf_char fold_state;
3370 if (c1== CR && !fold_preserve_f) {
3371 fold_state=0; /* ignore cr */
3372 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3374 fold_state=0; /* ignore cr */
3375 } else if (c1== BS) {
3376 if (f_line>0) f_line--;
3378 } else if (c2==EOF && f_line != 0) { /* close open last line */
3380 } else if ((c1==LF && !fold_preserve_f)
3381 || ((c1==CR||(c1==LF&&f_prev!=CR))
3382 && fold_preserve_f)) {
3384 if (fold_preserve_f) {
3388 } else if ((f_prev == c1 && !fold_preserve_f)
3389 || (f_prev == LF && fold_preserve_f)
3390 ) { /* duplicate newline */
3393 fold_state = LF; /* output two newline */
3399 if (f_prev&0x80) { /* Japanese? */
3401 fold_state = 0; /* ignore given single newline */
3402 } else if (f_prev==SP) {
3406 if (++f_line<=fold_len)
3410 fold_state = CR; /* fold and output nothing */
3414 } else if (c1=='\f') {
3417 fold_state = LF; /* output newline and clear */
3418 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3419 /* X0208 kankaku or ascii space */
3421 fold_state = 0; /* remove duplicate spaces */
3424 if (++f_line<=fold_len)
3425 fold_state = SP; /* output ASCII space only */
3427 f_prev = SP; f_line = 0;
3428 fold_state = CR; /* fold and output nothing */
3432 prev0 = f_prev; /* we still need this one... , but almost done */
3434 if (c2 || c2 == JIS_X_0201_1976_K)
3435 f_prev |= 0x80; /* this is Japanese */
3436 f_line += char_size(c2,c1);
3437 if (f_line<=fold_len) { /* normal case */
3440 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3441 f_line = char_size(c2,c1);
3442 fold_state = LF; /* We can't wait, do fold now */
3443 } else if (c2 == JIS_X_0201_1976_K) {
3444 /* simple kinsoku rules return 1 means no folding */
3445 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3446 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3447 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3448 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3449 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3450 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3451 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3453 fold_state = LF;/* add one new f_line before this character */
3456 fold_state = LF;/* add one new f_line before this character */
3459 /* kinsoku point in ASCII */
3460 if ( c1==')'|| /* { [ ( */
3471 /* just after special */
3472 } else if (!is_alnum(prev0)) {
3473 f_line = char_size(c2,c1);
3475 } else if ((prev0==SP) || /* ignored new f_line */
3476 (prev0==LF)|| /* ignored new f_line */
3477 (prev0&0x80)) { /* X0208 - ASCII */
3478 f_line = char_size(c2,c1);
3479 fold_state = LF;/* add one new f_line before this character */
3481 fold_state = 1; /* default no fold in ASCII */
/* Kinsoku characters that must not start a line.  The legacy comments were
   raw ISO-2022-JP bytes; each is given here as the JIS X 0208 code it encoded
   (row 0x21) together with the character name. */
3485 if (c1=='"') fold_state = 1; /* 0x2122 ideographic comma */
3486 else if (c1=='#') fold_state = 1; /* 0x2123 ideographic full stop */
3487 else if (c1=='W') fold_state = 1; /* 0x2157 right corner bracket */
3488 else if (c1=='K') fold_state = 1; /* 0x214B fullwidth right parenthesis */
3489 else if (c1=='$') fold_state = 1; /* 0x2124 fullwidth comma */
3490 else if (c1=='%') fold_state = 1; /* 0x2125 fullwidth full stop */
3491 else if (c1=='\'') fold_state = 1; /* NOTE(review): legacy comment named 0x215C (fullwidth plus sign), but the byte tested is 0x27, i.e. 0x2127 (fullwidth colon) in the same row; confirm which was intended */
3492 else if (c1=='(') fold_state = 1; /* 0x2128 fullwidth semicolon */
3493 else if (c1==')') fold_state = 1; /* 0x2129 fullwidth question mark */
3494 else if (c1=='*') fold_state = 1; /* 0x212A fullwidth exclamation mark */
3495 else if (c1=='+') fold_state = 1; /* 0x212B voiced sound mark (dakuten) */
3496 else if (c1==',') fold_state = 1; /* 0x212C semi-voiced sound mark (handakuten) */
3497 /* default no fold in kinsoku */
3500 f_line = char_size(c2,c1);
3501 /* add one new f_line before this character */
3504 f_line = char_size(c2,c1);
3506 /* add one new f_line before this character */
3511 /* terminator process */
3512 switch(fold_state) {
3514 OCONV_NEWLINE((*o_fconv));
3520 OCONV_NEWLINE((*o_fconv));
3531 static nkf_char z_prev2=0,z_prev1=0;
3534 z_conv(nkf_char c2, nkf_char c1)
3537 /* if (c2) c1 &= 0x7f; assertion */
3539 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3545 if (z_prev2 == JIS_X_0201_1976_K) {
3546 if (c2 == JIS_X_0201_1976_K) {
3547 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
/* previous kana + dakuten: emit the pair stored in dv[] for z_prev1 */
3549 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3551 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
/* previous kana + handakuten: emit the pair stored in ev[] for z_prev1 */
3553 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
/* emit the pair stored in cv[] for the pending kana z_prev1 */
3558 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3560 if (c2 == JIS_X_0201_1976_K) {
3561 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3562 /* wait for dakuten or handakuten */
3567 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3578 if (alpha_f&1 && c2 == 0x23) {
3579 /* JISX0208 Alphabet */
3581 } else if (c2 == 0x21) {
3582 /* JISX0208 Kigou */
3587 } else if (alpha_f&4) {
3592 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3598 if (alpha_f&8 && c2 == 0) {
/* HTML entity mode (alpha_f & 8): the HTML-special ASCII characters are
   replaced by their named character references, which are then sent to
   o_zconv one byte at a time.  The reference strings had been reduced to
   the bare characters (and `"""` is not even a valid C literal), which made
   this branch a no-op; they are restored here. */
3600 const char *entity = 0;
3602 case '>': entity = "&gt;"; break;
3603 case '<': entity = "&lt;"; break;
3604 case '\"': entity = "&quot;"; break;
3605 case '&': entity = "&amp;"; break;
3608 while (*entity) (*o_zconv)(0, *entity++);
3614 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3619 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3623 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3627 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3631 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3635 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3639 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3643 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3647 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3652 (*o_zconv)(JIS_X_0201_1976_K, c);
3655 } else if (c2 == 0x25) {
3656 /* JISX0208 Katakana */
3657 static const int fullwidth_to_halfwidth[] =
3659 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3660 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3661 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3662 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3663 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3664 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3665 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3666 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3667 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3668 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3669 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3670 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3672 if (fullwidth_to_halfwidth[c1-0x20]){
3673 c2 = fullwidth_to_halfwidth[c1-0x20];
3674 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3676 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
3686 #define rot13(c) ( \
3688 (c <= 'M') ? (c + 13): \
3689 (c <= 'Z') ? (c - 13): \
3691 (c <= 'm') ? (c + 13): \
3692 (c <= 'z') ? (c - 13): \
3696 #define rot47(c) ( \
3698 ( c <= 'O') ? (c + 47) : \
3699 ( c <= '~') ? (c - 47) : \
3704 rot_conv(nkf_char c2, nkf_char c1)
3706 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3712 (*o_rot_conv)(c2,c1);
3716 hira_conv(nkf_char c2, nkf_char c1)
3720 if (0x20 < c1 && c1 < 0x74) {
3722 (*o_hira_conv)(c2,c1);
3724 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3726 c1 = nkf_char_unicode_new(0x3094);
3727 (*o_hira_conv)(c2,c1);
3730 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3732 (*o_hira_conv)(c2,c1);
3737 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3740 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3742 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3746 (*o_hira_conv)(c2,c1);
3751 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3753 #define RANGE_NUM_MAX 18
3754 static const nkf_char range[RANGE_NUM_MAX][2] = {
3775 nkf_char start, end, c;
3777 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3781 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3786 for (i = 0; i < RANGE_NUM_MAX; i++) {
3787 start = range[i][0];
3790 if (c >= start && c <= end) {
3795 (*o_iso2022jp_check_conv)(c2,c1);
3799 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3801 static const unsigned char *mime_pattern[] = {
3802 (const unsigned char *)"\075?EUC-JP?B?",
3803 (const unsigned char *)"\075?SHIFT_JIS?B?",
3804 (const unsigned char *)"\075?ISO-8859-1?Q?",
3805 (const unsigned char *)"\075?ISO-8859-1?B?",
3806 (const unsigned char *)"\075?ISO-2022-JP?B?",
3807 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3808 #if defined(UTF8_INPUT_ENABLE)
3809 (const unsigned char *)"\075?UTF-8?B?",
3810 (const unsigned char *)"\075?UTF-8?Q?",
3812 (const unsigned char *)"\075?US-ASCII?Q?",
3817 /* Marker used to raise the priority of the matching encoding */
3818 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3819 e_iconv, s_iconv, 0, 0, 0, 0,
3820 #if defined(UTF8_INPUT_ENABLE)
3826 static const nkf_char mime_encode[] = {
3827 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3828 #if defined(UTF8_INPUT_ENABLE)
3835 static const nkf_char mime_encode_method[] = {
3836 'B', 'B','Q', 'B', 'B', 'Q',
3837 #if defined(UTF8_INPUT_ENABLE)
3845 /* MIME preprocessor fifo */
3847 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3848 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3849 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3851 unsigned char buf[MIME_BUF_SIZE];
3853 unsigned int last; /* decoded */
3854 unsigned int input; /* undecoded */
3856 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
3858 #define MAXRECOVER 20
3861 mime_input_buf_unshift(nkf_char c)
3863 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
3867 mime_ungetc(nkf_char c, FILE *f)
3869 mime_input_buf_unshift(c);
3874 mime_ungetc_buf(nkf_char c, FILE *f)
3877 (*i_mungetc_buf)(c,f);
3879 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
3884 mime_getc_buf(FILE *f)
3886 /* We don't keep EOF in mime_input_buf, because it contains ?= as
3887 a terminator. It was checked in mime_integrity. */
/* When mimebuf_f is set, read through the saved getter i_mgetc_buf;
   otherwise consume the next undecoded byte of the ring buffer. */
3888 return ((mimebuf_f)?
3889 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
3893 switch_mime_getc(void)
3895 if (i_getc!=mime_getc) {
3896 i_mgetc = i_getc; i_getc = mime_getc;
3897 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3898 if(mime_f==STRICT_MIME) {
3899 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3900 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3906 unswitch_mime_getc(void)
3908 if(mime_f==STRICT_MIME) {
3909 i_mgetc = i_mgetc_buf;
3910 i_mungetc = i_mungetc_buf;
3913 i_ungetc = i_mungetc;
3914 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3915 mime_iconv_back = NULL;
3919 mime_integrity(FILE *f, const unsigned char *p)
3923 /* In buffered mode, read until =? or NL or buffer full
3925 mime_input_state.input = mime_input_state.top;
3926 mime_input_state.last = mime_input_state.top;
3928 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3930 q = mime_input_state.input;
3931 while((c=(*i_getc)(f))!=EOF) {
3932 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3933 break; /* buffer full */
3935 if (c=='=' && d=='?') {
3936 /* checked. skip header, start decode */
3937 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3938 /* mime_last_input = mime_input_state.input; */
3939 mime_input_state.input = q;
3943 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3945 /* Should we check length mod 4? */
3946 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3949 /* In case of Incomplete MIME, no MIME decode */
3950 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3951 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3952 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3953 switch_mime_getc(); /* anyway we need buffered getc */
3958 mime_begin_strict(FILE *f)
3962 const unsigned char *p,*q;
3963 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3965 mime_decode_mode = FALSE;
3966 /* =? has been checked */
3968 p = mime_pattern[j];
3971 for(i=2;p[i]>SP;i++) { /* start at =? */
3972 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3973 /* pattern fails, try next one */
3975 while (mime_pattern[++j]) {
3976 p = mime_pattern[j];
3977 for(k=2;k<i;k++) /* assume length(p) > i */
3978 if (p[k]!=q[k]) break;
3979 if (k==i && nkf_toupper(c1)==p[k]) break;
3981 p = mime_pattern[j];
3982 if (p) continue; /* found next one, continue */
3983 /* all fails, output from recovery buffer */
3991 mime_decode_mode = p[i-2];
3993 mime_iconv_back = iconv;
3994 set_iconv(FALSE, mime_priority_func[j]);
3995 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3997 if (mime_decode_mode=='B') {
3998 mimebuf_f = unbuf_f;
4000 /* do MIME integrity check */
4001 return mime_integrity(f,mime_pattern[j]);
4015 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4016 /* re-read and convert again from mime_buffer. */
4018 /* =? has been checked */
4019 k = mime_input_state.last;
4020 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4021 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4022 /* We accept any character type even if it is broken by new lines */
4023 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4024 if (c1==LF||c1==SP||c1==CR||
4025 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4027 /* Failed. But this could be another MIME preamble */
4029 mime_input_state.last--;
4035 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4036 if (!(++i<MAXRECOVER) || c1==EOF) break;
4037 if (c1=='b'||c1=='B') {
4038 mime_decode_mode = 'B';
4039 } else if (c1=='q'||c1=='Q') {
4040 mime_decode_mode = 'Q';
4044 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4045 if (!(++i<MAXRECOVER) || c1==EOF) break;
4047 mime_decode_mode = FALSE;
4053 if (!mime_decode_mode) {
4054 /* false MIME preamble, restart from mime_buffer */
4055 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4056 /* Since we are in MIME mode until buffer becomes empty, */
4057 /* we never go into mime_begin again for a while. */
4060 /* discard mime preamble, and goto MIME mode */
4061 mime_input_state.last = k;
4062 /* do no MIME integrity check */
4063 return c1; /* used only for checking EOF */
4074 debug(const char *str)
4077 fprintf(stderr, "%s\n", str ? str : "NULL");
4083 set_input_codename(const char *codename)
4085 if (!input_codename) {
4086 input_codename = codename; /* first detection: remember it */
4087 } else if (strcmp(codename, input_codename) != 0) {
4088 input_codename = ""; /* conflicting detections: the empty string marks the input as undetermined (get_guessed_code reports it as "BINARY") */
4093 get_guessed_code(void)
/* Refines input_codename into the name that is reported and returns it:
   "" becomes "BINARY", NULL becomes "ASCII", and the three Japanese base
   names are narrowed to a vendor variant according to the score bits of the
   input_code entry that matches the current iconv. */
4095 if (input_codename && !*input_codename) {
4096 input_codename = "BINARY";
/* NOTE(review): p is dereferenced below without a NULL check; confirm that
   find_inputcode_byfunc() cannot return NULL for the active iconv. */
4098 struct input_code *p = find_inputcode_byfunc(iconv);
4099 if (!input_codename) {
4100 input_codename = "ASCII";
4101 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4102 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4103 input_codename = "CP932";
4104 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4105 if (p->score & (SCORE_X0212))
4106 input_codename = "EUCJP-MS";
4107 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4108 input_codename = "CP51932";
4109 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4110 if (p->score & (SCORE_KANA))
4111 input_codename = "CP50221";
4112 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4113 input_codename = "CP50220";
4116 return input_codename;
4119 #if !defined(PERL_XS) && !defined(WIN32DLL)
4121 print_guessed_code(char *filename)
4123 if (filename != NULL) printf("%s: ", filename);
4124 if (input_codename && !*input_codename) {
4127 input_codename = get_guessed_code();
4129 printf("%s\n", input_codename);
4133 input_eol == CR ? " (CR)" :
4134 input_eol == LF ? " (LF)" :
4135 input_eol == CRLF ? " (CRLF)" :
4136 input_eol == EOF ? " (MIXED NL)" :
4146 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4148 nkf_char c1, c2, c3;
4154 if (!nkf_isxdigit(c2)){
4159 if (!nkf_isxdigit(c3)){
4164 return (hex2bin(c2) << 4) | hex2bin(c3);
4170 return hex_getc(':', f, i_cgetc, i_cungetc);
4174 cap_ungetc(nkf_char c, FILE *f)
4176 return (*i_cungetc)(c, f);
4182 return hex_getc('%', f, i_ugetc, i_uungetc);
4186 url_ungetc(nkf_char c, FILE *f)
4188 return (*i_uungetc)(c, f);
4192 #ifdef NUMCHAR_OPTION
4194 numchar_getc(FILE *f)
4196 nkf_char (*g)(FILE *) = i_ngetc;
4197 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4208 if (buf[i] == 'x' || buf[i] == 'X'){
4209 for (j = 0; j < 7; j++){
4211 if (!nkf_isxdigit(buf[i])){
4218 c |= hex2bin(buf[i]);
4221 for (j = 0; j < 8; j++){
4225 if (!nkf_isdigit(buf[i])){
4232 c += hex2bin(buf[i]);
4238 return nkf_char_unicode_new(c);
4248 numchar_ungetc(nkf_char c, FILE *f)
4250 return (*i_nungetc)(c, f);
4254 #ifdef UNICODE_NORMALIZATION
4259 nkf_char (*g)(FILE *f) = i_nfc_getc;
4260 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4261 nkf_buf_t *buf = nkf_state->nfc_buf;
4262 const unsigned char *array;
4263 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4264 nkf_char c = (*g)(f);
4266 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4268 nkf_buf_push(buf, c);
4270 while (lower <= upper) {
4271 int mid = (lower+upper) / 2;
4273 array = normalization_table[mid].nfd;
4274 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4275 if (len >= nkf_buf_length(buf)) {
4279 lower = 1, upper = 0;
4282 nkf_buf_push(buf, c);
4284 if (array[len] != nkf_buf_at(buf, len)) {
4285 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4286 else upper = mid - 1;
4293 array = normalization_table[mid].nfc;
4295 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4296 nkf_buf_push(buf, array[i]);
4300 } while (lower <= upper);
4302 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4303 c = nkf_buf_pop(buf);
4309 nfc_ungetc(nkf_char c, FILE *f)
4311 return (*i_nfc_ungetc)(c, f);
4313 #endif /* UNICODE_NORMALIZATION */
4317 base64decode(nkf_char c)
4322 i = c - 'A'; /* A..Z 0-25 */
4323 } else if (c == '_') {
4324 i = '?' /* 63 */ ; /* _ 63 */
4326 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4328 } else if (c > '/') {
4329 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4330 } else if (c == '+' || c == '-') {
4331 i = '>' /* 62 */ ; /* + and - 62 */
4333 i = '?' /* 63 */ ; /* / 63 */
4341 nkf_char c1, c2, c3, c4, cc;
4342 nkf_char t1, t2, t3, t4, mode, exit_mode;
4343 nkf_char lwsp_count;
4346 nkf_char lwsp_size = 128;
4348 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4349 return mime_input_buf(mime_input_state.top++);
4351 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4352 mime_decode_mode=FALSE;
4353 unswitch_mime_getc();
4354 return (*i_getc)(f);
4357 if (mimebuf_f == FIXED_MIME)
4358 exit_mode = mime_decode_mode;
4361 if (mime_decode_mode == 'Q') {
4362 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4364 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4365 if (c1<=SP || DEL<=c1) {
4366 mime_decode_mode = exit_mode; /* prepare for quit */
4369 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4373 mime_decode_mode = exit_mode; /* prepare for quit */
4374 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4375 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4376 /* end Q encoding */
4377 input_mode = exit_mode;
4379 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4380 while ((c1=(*i_getc)(f))!=EOF) {
4385 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4393 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4394 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4409 lwsp_buf[lwsp_count] = (unsigned char)c1;
4410 if (lwsp_count++>lwsp_size){
4412 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4413 lwsp_buf = lwsp_buf_new;
4419 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4421 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4422 i_ungetc(lwsp_buf[lwsp_count],f);
4425 nkf_xfree(lwsp_buf);
4428 if (c1=='='&&c2<SP) { /* this is soft wrap */
4429 while((c1 = (*i_mgetc)(f)) <=SP) {
4430 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4432 mime_decode_mode = 'Q'; /* still in MIME */
4433 goto restart_mime_q;
4436 mime_decode_mode = 'Q'; /* still in MIME */
4440 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4441 if (c2<=SP) return c2;
4442 mime_decode_mode = 'Q'; /* still in MIME */
4443 return ((hex2bin(c2)<<4) + hex2bin(c3));
4446 if (mime_decode_mode != 'B') {
4447 mime_decode_mode = FALSE;
4448 return (*i_mgetc)(f);
4452 /* Base64 encoding */
4454 MIME allows line break in the middle of
4455 Base64, but we are very pessimistic in decoding
4456 in unbuf mode because MIME encoded code may broken by
4457 less or editor's control sequence (such as ESC-[-K in unbuffered
4458 mode. ignore incomplete MIME.
4460 mode = mime_decode_mode;
4461 mime_decode_mode = exit_mode; /* prepare for quit */
4463 while ((c1 = (*i_mgetc)(f))<=SP) {
4468 if ((c2 = (*i_mgetc)(f))<=SP) {
4471 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4472 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4475 if ((c1 == '?') && (c2 == '=')) {
4478 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4479 while ((c1=(*i_getc)(f))!=EOF) {
4484 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4492 if ((c1=(*i_getc)(f))!=EOF) {
4496 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4511 lwsp_buf[lwsp_count] = (unsigned char)c1;
4512 if (lwsp_count++>lwsp_size){
4514 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4515 lwsp_buf = lwsp_buf_new;
4521 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4523 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4524 i_ungetc(lwsp_buf[lwsp_count],f);
4527 nkf_xfree(lwsp_buf);
4531 if ((c3 = (*i_mgetc)(f))<=SP) {
4534 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4535 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4539 if ((c4 = (*i_mgetc)(f))<=SP) {
4542 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4543 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4547 mime_decode_mode = mode; /* still in MIME sigh... */
4549 /* BASE 64 decoding */
4551 t1 = 0x3f & base64decode(c1);
4552 t2 = 0x3f & base64decode(c2);
4553 t3 = 0x3f & base64decode(c3);
4554 t4 = 0x3f & base64decode(c4);
4555 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4557 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4558 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4560 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4561 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4563 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4568 return mime_input_buf(mime_input_state.top++);
4571 static const char basis_64[] =
4572 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4574 #define MIMEOUT_BUF_LENGTH 74
4576 char buf[MIMEOUT_BUF_LENGTH+1];
4580 /*nkf_char mime_lastchar2, mime_lastchar1;*/
4583 open_mime(nkf_char mode)
4585 const unsigned char *p;
4588 p = mime_pattern[0];
4589 for(i=0;mime_pattern[i];i++) {
4590 if (mode == mime_encode[i]) {
4591 p = mime_pattern[i];
4595 mimeout_mode = mime_encode_method[i];
4597 if (base64_count>45) {
4598 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4599 (*o_mputc)(mimeout_state.buf[i]);
4602 PUT_NEWLINE((*o_mputc));
4605 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
4609 for (;i<mimeout_state.count;i++) {
4610 if (nkf_isspace(mimeout_state.buf[i])) {
4611 (*o_mputc)(mimeout_state.buf[i]);
4621 j = mimeout_state.count;
4622 mimeout_state.count = 0;
4624 mime_putc(mimeout_state.buf[i]);
4629 mime_prechar(nkf_char c2, nkf_char c1)
4631 if (mimeout_mode > 0){
4633 if (base64_count + mimeout_state.count/3*4> 73){
4634 (*o_base64conv)(EOF,0);
4635 OCONV_NEWLINE((*o_base64conv));
4636 (*o_base64conv)(0,SP);
4640 if (base64_count + mimeout_state.count/3*4> 66) {
4641 (*o_base64conv)(EOF,0);
4642 OCONV_NEWLINE((*o_base64conv));
4643 (*o_base64conv)(0,SP);
4649 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4650 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4651 open_mime(output_mode);
4652 (*o_base64conv)(EOF,0);
4653 OCONV_NEWLINE((*o_base64conv));
4654 (*o_base64conv)(0,SP);
4673 switch(mimeout_mode) {
4678 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
4684 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
4689 if (mimeout_mode > 0) {
4690 if (mimeout_f!=FIXED_MIME) {
4692 } else if (mimeout_mode != 'Q')
4698 mimeout_addchar(nkf_char c)
4700 switch(mimeout_mode) {
4705 } else if(!nkf_isalnum(c)) {
4707 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4708 (*o_mputc)(bin2hex((c&0xf)));
4716 nkf_state->mimeout_state=c;
4717 (*o_mputc)(basis_64[c>>2]);
4722 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4723 nkf_state->mimeout_state=c;
4728 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4729 (*o_mputc)(basis_64[c & 0x3F]);
4741 mime_putc(nkf_char c)
4746 if (mimeout_f == FIXED_MIME){
4747 if (mimeout_mode == 'Q'){
4748 if (base64_count > 71){
4749 if (c!=CR && c!=LF) {
4751 PUT_NEWLINE((*o_mputc));
4756 if (base64_count > 71){
4758 PUT_NEWLINE((*o_mputc));
4761 if (c == EOF) { /* c==EOF */
4765 if (c != EOF) { /* c==EOF */
4771 /* mimeout_f != FIXED_MIME */
4773 if (c == EOF) { /* c==EOF */
4774 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4775 j = mimeout_state.count;
4776 mimeout_state.count = 0;
4778 if (mimeout_mode > 0) {
4779 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4781 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4784 mimeout_addchar(mimeout_state.buf[i]);
4788 mimeout_addchar(mimeout_state.buf[i]);
4792 mimeout_addchar(mimeout_state.buf[i]);
4798 mimeout_addchar(mimeout_state.buf[i]);
4804 if (mimeout_state.count > 0){
4805 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4810 if (mimeout_mode=='Q') {
4811 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4812 if (c == CR || c == LF) {
4817 } else if (c <= SP) {
4819 if (base64_count > 70) {
4820 PUT_NEWLINE((*o_mputc));
4823 if (!nkf_isblank(c)) {
4828 if (base64_count > 70) {
4830 PUT_NEWLINE((*o_mputc));
4833 open_mime(output_mode);
4835 if (!nkf_noescape_mime(c)) {
4846 if (mimeout_mode <= 0) {
4847 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4848 if (nkf_isspace(c)) {
4850 if (mimeout_mode == -1) {
4853 if (c==CR || c==LF) {
4855 open_mime(output_mode);
4861 for (i=0;i<mimeout_state.count;i++) {
4862 (*o_mputc)(mimeout_state.buf[i]);
4863 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4874 mimeout_state.buf[0] = (char)c;
4875 mimeout_state.count = 1;
4877 if (base64_count > 1
4878 && base64_count + mimeout_state.count > 76
4879 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4880 static const char *str = "boundary=\"";
4881 static int len = 10;
4884 for (; i < mimeout_state.count - len; ++i) {
4885 if (!strncmp(mimeout_state.buf+i, str, len)) {
4891 if (i == 0 || i == mimeout_state.count - len) {
4892 PUT_NEWLINE((*o_mputc));
4894 if (!nkf_isspace(mimeout_state.buf[0])){
4901 for (j = 0; j <= i; ++j) {
4902 (*o_mputc)(mimeout_state.buf[j]);
4904 PUT_NEWLINE((*o_mputc));
4906 for (; j <= mimeout_state.count; ++j) {
4907 mimeout_state.buf[j - i] = mimeout_state.buf[j];
4909 mimeout_state.count -= i;
4912 mimeout_state.buf[mimeout_state.count++] = (char)c;
4913 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4914 open_mime(output_mode);
4919 if (lastchar==CR || lastchar == LF){
4920 for (i=0;i<mimeout_state.count;i++) {
4921 (*o_mputc)(mimeout_state.buf[i]);
4924 mimeout_state.count = 0;
4927 for (i=0;i<mimeout_state.count-1;i++) {
4928 (*o_mputc)(mimeout_state.buf[i]);
4931 mimeout_state.buf[0] = SP;
4932 mimeout_state.count = 1;
4934 open_mime(output_mode);
4937 /* mimeout_mode == 'B', 1, 2 */
4938 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4939 if (lastchar == CR || lastchar == LF){
4940 if (nkf_isblank(c)) {
4941 for (i=0;i<mimeout_state.count;i++) {
4942 mimeout_addchar(mimeout_state.buf[i]);
4944 mimeout_state.count = 0;
4945 } else if (SP<c && c<DEL) {
4947 for (i=0;i<mimeout_state.count;i++) {
4948 (*o_mputc)(mimeout_state.buf[i]);
4951 mimeout_state.count = 0;
4953 mimeout_state.buf[mimeout_state.count++] = (char)c;
4956 if (nkf_isspace(c)) {
4957 for (i=0;i<mimeout_state.count;i++) {
4958 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4960 for (i=0;i<mimeout_state.count;i++) {
4961 (*o_mputc)(mimeout_state.buf[i]);
4964 mimeout_state.count = 0;
4967 mimeout_state.buf[mimeout_state.count++] = (char)c;
4968 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4970 for (i=0;i<mimeout_state.count;i++) {
4971 (*o_mputc)(mimeout_state.buf[i]);
4974 mimeout_state.count = 0;
4978 if (mimeout_state.count>0 && SP<c && c!='=') {
4979 mimeout_state.buf[mimeout_state.count++] = (char)c;
4980 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4981 j = mimeout_state.count;
4982 mimeout_state.count = 0;
4984 mimeout_addchar(mimeout_state.buf[i]);
4991 if (mimeout_state.count>0) {
4992 j = mimeout_state.count;
4993 mimeout_state.count = 0;
4995 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4997 mimeout_addchar(mimeout_state.buf[i]);
5003 (*o_mputc)(mimeout_state.buf[i]);
5005 open_mime(output_mode);
5012 base64_conv(nkf_char c2, nkf_char c1)
5014 mime_prechar(c2, c1);
5015 (*o_base64conv)(c2,c1);
5019 typedef struct nkf_iconv_t {
5022 size_t input_buffer_size;
5023 char *output_buffer;
5024 size_t output_buffer_size;
5028 nkf_iconv_new(char *tocode, char *fromcode)
/* Sets up a converter: allocates an input buffer of IOBUF_SIZE bytes and an
   output buffer of twice that, then opens an iconv descriptor for
   fromcode -> tocode. */
/* NOTE(review): `converter` is declared as an object but accessed with `->`
   below; it presumably needs to be an allocated pointer. Verify against the
   nkf_iconv_t definition and the return statement (not visible here). */
5030 nkf_iconv_t converter;
5032 converter->input_buffer_size = IOBUF_SIZE;
5033 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5034 converter->output_buffer_size = IOBUF_SIZE * 2;
5035 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5036 converter->cd = iconv_open(tocode, fromcode);
5037 if (converter->cd == (iconv_t)-1)
/* NOTE(review): fprintf() is called without a FILE * and its int result is
   handed to perror(), which expects a string; this cannot be what was meant. */
5041 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5044 perror("can't iconv_open");
5050 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5052 size_t invalid = (size_t)0;
5053 char *input_buffer = converter->input_buffer;
5054 size_t input_length = (size_t)0;
5055 char *output_buffer = converter->output_buffer;
5056 size_t output_length = converter->output_buffer_size;
/* NOTE(review): the stream parameter is named `input`, but `f` is read here;
   no declaration of `f` is visible in this function. */
5061 while ((c = (*i_getc)(f)) != EOF) {
5062 input_buffer[input_length++] = c;
/* NOTE(review): with `<` this leaves the fill loop after the first byte
   whenever the buffer holds more than one byte; `>=` (stop when full) looks
   like the intent. */
5063 if (input_length < converter->input_buffer_size) break;
5067 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
/* NOTE(review): iconv() advances output_buffer and leaves the remaining free
   space in output_length; this loop appears to treat them as the buffer
   start and the produced byte count. Verify. */
5068 while (output_length-- > 0) {
5069 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5071 if (ret == (size_t) - 1) {
/* keep the bytes iconv did not consume at the front of the input buffer */
5074 if (input_buffer != converter->input_buffer)
5075 memmove(converter->input_buffer, input_buffer, input_length);
5078 converter->output_buffer_size *= 2;
/* NOTE(review): the field is called output_buffer everywhere else in this
   file section; `outbuf` looks like a stale name. */
5079 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5080 if (output_buffer == NULL) {
5081 perror("can't realloc");
5084 converter->output_buffer = output_buffer;
5087 perror("can't iconv");
5100 nkf_iconv_close(nkf_iconv_t *convert)
/* Frees both buffers and closes the iconv descriptor. */
/* NOTE(review): the parameter is named `convert` but the body uses
   `converter`, and the members freed (`inbuf`, `outbuf`) do not match the
   `input_buffer` / `output_buffer` names used by nkf_iconv_new. */
5102 nkf_xfree(converter->inbuf);
5103 nkf_xfree(converter->outbuf);
5104 iconv_close(converter->cd);
5113 struct input_code *p = input_code_list;
5125 mime_f = MIME_DECODE_DEFAULT;
5126 mime_decode_f = FALSE;
5131 x0201_f = X0201_DEFAULT;
5132 iso2022jp_f = FALSE;
5133 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5134 ms_ucs_map_f = UCS_MAP_ASCII;
5136 #ifdef UTF8_INPUT_ENABLE
5137 no_cp932ext_f = FALSE;
5138 no_best_fit_chars_f = FALSE;
5139 encode_fallback = NULL;
5140 unicode_subchar = '?';
5141 input_endian = ENDIAN_BIG;
5143 #ifdef UTF8_OUTPUT_ENABLE
5144 output_bom_f = FALSE;
5145 output_endian = ENDIAN_BIG;
5147 #ifdef UNICODE_NORMALIZATION
5163 #ifdef SHIFTJIS_CP932
5173 for (i = 0; i < 256; i++){
5174 prefix_table[i] = 0;
5178 mimeout_state.count = 0;
5183 fold_preserve_f = FALSE;
5186 kanji_intro = DEFAULT_J;
5187 ascii_intro = DEFAULT_R;
5188 fold_margin = FOLD_MARGIN;
5189 o_zconv = no_connection;
5190 o_fconv = no_connection;
5191 o_eol_conv = no_connection;
5192 o_rot_conv = no_connection;
5193 o_hira_conv = no_connection;
5194 o_base64conv = no_connection;
5195 o_iso2022jp_check_conv = no_connection;
5198 i_ungetc = std_ungetc;
5200 i_bungetc = std_ungetc;
5203 i_mungetc = std_ungetc;
5204 i_mgetc_buf = std_getc;
5205 i_mungetc_buf = std_ungetc;
5206 output_mode = ASCII;
5208 mime_decode_mode = FALSE;
5214 z_prev2=0,z_prev1=0;
5216 iconv_for_check = 0;
5218 input_codename = NULL;
5219 input_encoding = NULL;
5220 output_encoding = NULL;
5228 module_connection(void)
5230 if (input_encoding) set_input_encoding(input_encoding);
5231 if (!output_encoding) {
5232 output_encoding = nkf_default_encoding();
5234 if (!output_encoding) {
5235 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5238 set_output_encoding(output_encoding);
5239 oconv = nkf_enc_to_oconv(output_encoding);
5242 /* replace continuation module, from output side */
/* Each stage wired in below saves the current oconv in its own o_* hook and
   then installs itself as oconv, so the stage installed last is the first
   one to receive a character. */
5244 /* output redirection */
5246 if (noout_f || guess_f){
5253 if (mimeout_f == TRUE) {
5254 o_base64conv = oconv; oconv = base64_conv;
5256 /* base64_count = 0; */
5259 if (eolmode_f || guess_f) {
5260 o_eol_conv = oconv; oconv = eol_conv;
5263 o_rot_conv = oconv; oconv = rot_conv;
5266 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5269 o_hira_conv = oconv; oconv = hira_conv;
5272 o_fconv = oconv; oconv = fold_conv;
5275 if (alpha_f || x0201_f) {
5276 o_zconv = oconv; oconv = z_conv;
5280 i_ungetc = std_ungetc;
5281 /* input redirection */
5284 i_cgetc = i_getc; i_getc = cap_getc;
5285 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5288 i_ugetc = i_getc; i_getc = url_getc;
5289 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5292 #ifdef NUMCHAR_OPTION
5294 i_ngetc = i_getc; i_getc = numchar_getc;
5295 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5298 #ifdef UNICODE_NORMALIZATION
5300 i_nfc_getc = i_getc; i_getc = nfc_getc;
5301 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5304 if (mime_f && mimebuf_f==FIXED_MIME) {
5305 i_mgetc = i_getc; i_getc = mime_getc;
5306 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5309 i_bgetc = i_getc; i_getc = broken_getc;
5310 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5312 if (input_encoding) {
5313 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5315 set_iconv(FALSE, e_iconv);
5319 struct input_code *p = input_code_list;
5328 Conversion main loop. Code detection only.
5331 #if !defined(PERL_XS) && !defined(WIN32DLL)
5338 module_connection();
5339 while ((c = (*i_getc)(f)) != EOF)
5346 #define NEXT continue /* no output, get next */
5347 #define SKIP c2=0;continue /* no output, get next */
5348 #define MORE c2=c1;continue /* need one more byte */
5349 #define SEND ; /* output c1 and c2, get next */
5350 #define LAST break /* end of loop, go closing */
5351 #define set_input_mode(mode) do { \
5352 input_mode = mode; \
5354 set_input_codename("ISO-2022-JP"); \
5355 debug("ISO-2022-JP"); \
5359 kanji_convert(FILE *f)
5361 nkf_char c1=0, c2=0, c3=0, c4=0;
5362 int shift_mode = 0; /* 0, 1, 2, 3 */
5364 int is_8bit = FALSE;
5366 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5371 output_mode = ASCII;
5373 if (module_connection() < 0) {
5374 #if !defined(PERL_XS) && !defined(WIN32DLL)
5375 fprintf(stderr, "no output encoding given\n");
5381 #ifdef UTF8_INPUT_ENABLE
5382 if(iconv == w_iconv32){
5383 while ((c1 = (*i_getc)(f)) != EOF &&
5384 (c2 = (*i_getc)(f)) != EOF &&
5385 (c3 = (*i_getc)(f)) != EOF &&
5386 (c4 = (*i_getc)(f)) != EOF) {
5387 nkf_iconv_utf_32(c1, c2, c3, c4);
5389 (*i_ungetc)(EOF, f);
5391 else if (iconv == w_iconv16) {
5392 while ((c1 = (*i_getc)(f)) != EOF &&
5393 (c2 = (*i_getc)(f)) != EOF) {
5394 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5395 (c3 = (*i_getc)(f)) != EOF &&
5396 (c4 = (*i_getc)(f)) != EOF) {
5397 nkf_iconv_utf_16(c1, c2, c3, c4);
5400 (*i_ungetc)(EOF, f);
5404 while ((c1 = (*i_getc)(f)) != EOF) {
5405 #ifdef INPUT_CODE_FIX
5406 if (!input_encoding)
5412 /* in case of 8th bit is on */
5413 if (!estab_f&&!mime_decode_mode) {
5414 /* in case of not established yet */
5415 /* It is still ambiguous */
5416 if (h_conv(f, c2, c1)==EOF) {
5424 /* in case of already established */
5426 /* ignore bogus code */
5434 /* 2nd byte of 7 bit code or SJIS */
5438 else if (nkf_char_unicode_p(c1)) {
5444 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5447 } else if (c1 > DEL) {
5449 if (!estab_f && !iso8859_f) {
5450 /* not established yet */
5452 } else { /* estab_f==TRUE */
5458 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5459 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5461 c2 = JIS_X_0201_1976_K;
5466 /* already established */
5470 } else if (SP < c1 && c1 < DEL) {
5471 /* in case of Roman characters */
5473 /* output 1 shifted byte */
5477 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5478 /* output 1 shifted byte */
5479 c2 = JIS_X_0201_1976_K;
5482 /* look like bogus code */
5485 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5486 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5487 /* in case of Kanji shifted */
5489 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5490 /* Check MIME code */
5491 if ((c1 = (*i_getc)(f)) == EOF) {
5494 } else if (c1 == '?') {
5495 /* =? is mime conversion start sequence */
5496 if(mime_f == STRICT_MIME) {
5497 /* check in real detail */
5498 if (mime_begin_strict(f) == EOF)
5501 } else if (mime_begin(f) == EOF)
5510 /* normal ASCII code */
5513 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5516 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5519 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5520 if ((c1 = (*i_getc)(f)) == EOF) {
5521 /* (*oconv)(0, ESC); don't send bogus code */
5524 else if (c1 == '&') {
5526 if ((c1 = (*i_getc)(f)) == EOF) {
5532 else if (c1 == '$') {
5534 if ((c1 = (*i_getc)(f)) == EOF) {
5535 /* don't send bogus code
5537 (*oconv)(0, '$'); */
5539 } else if (c1 == '@' || c1 == 'B') {
5541 set_input_mode(JIS_X_0208);
5543 } else if (c1 == '(') {
5545 if ((c1 = (*i_getc)(f)) == EOF) {
5546 /* don't send bogus code
5552 } else if (c1 == '@'|| c1 == 'B') {
5554 set_input_mode(JIS_X_0208);
5557 } else if (c1 == 'D'){
5558 set_input_mode(JIS_X_0212);
5560 #endif /* X0212_ENABLE */
5561 } else if (c1 == 'O' || c1 == 'Q'){
5562 set_input_mode(JIS_X_0213_1);
5564 } else if (c1 == 'P'){
5565 set_input_mode(JIS_X_0213_2);
5568 /* could be some special code */
5575 } else if (broken_f&0x2) {
5576 /* accept any ESC-(-x as broken code ... */
5577 input_mode = JIS_X_0208;
5586 } else if (c1 == '(') {
5588 if ((c1 = (*i_getc)(f)) == EOF) {
5589 /* don't send bogus code
5591 (*oconv)(0, '('); */
5594 else if (c1 == 'I') {
5595 /* JIS X 0201 Katakana */
5596 set_input_mode(JIS_X_0201_1976_K);
5599 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5600 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5601 set_input_mode(ASCII);
5604 else if (broken_f&0x2) {
5605 set_input_mode(ASCII);
5614 else if (c1 == '.') {
5616 if ((c1 = (*i_getc)(f)) == EOF) {
5619 else if (c1 == 'A') {
5630 else if (c1 == 'N') {
5633 if (g2 == ISO_8859_1) {
5648 } else if (c1 == ESC && iconv == s_iconv) {
5649 /* ESC in Shift_JIS */
5650 if ((c1 = (*i_getc)(f)) == EOF) {
5651 /* (*oconv)(0, ESC); don't send bogus code */
5653 } else if (c1 == '$') {
5655 if ((c1 = (*i_getc)(f)) == EOF) {
5657 } else if (('E' <= c1 && c1 <= 'G') ||
5658 ('O' <= c1 && c1 <= 'Q')) {
5666 static const nkf_char jphone_emoji_first_table[7] =
5667 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5668 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5669 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5670 while (SP <= c1 && c1 <= 'z') {
5671 (*oconv)(0, c1 + c3);
5672 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5687 } else if (c1 == LF || c1 == CR) {
5689 input_mode = ASCII; set_iconv(FALSE, 0);
5691 } else if (mime_decode_f && !mime_decode_mode){
5693 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5701 } else { /* if (c1 == CR)*/
5702 if ((c1=(*i_getc)(f))!=EOF) {
5706 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5726 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5729 if ((c3 = (*i_getc)(f)) != EOF) {
5732 if ((c4 = (*i_getc)(f)) != EOF) {
5734 (*iconv)(c2, c1, c3|c4);
5739 /* 3 bytes EUC or UTF-8 */
5740 if ((c3 = (*i_getc)(f)) != EOF) {
5742 (*iconv)(c2, c1, c3);
5750 0x7F <= c2 && c2 <= 0x92 &&
5751 0x21 <= c1 && c1 <= 0x7E) {
5753 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5756 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5760 (*oconv)(PREFIX_EUCG3 | c2, c1);
5762 #endif /* X0212_ENABLE */
5764 (*oconv)(PREFIX_EUCG3 | c2, c1);
5767 (*oconv)(input_mode, c1); /* other special case */
5773 /* goto next_word */
5777 (*iconv)(EOF, 0, 0);
5778 if (!input_codename)
5781 struct input_code *p = input_code_list;
5782 struct input_code *result = p;
5784 if (p->score < result->score) result = p;
5787 set_input_codename(result->name);
5789 debug(result->name);
5797 * int options(unsigned char *cp)
5804 options(unsigned char *cp)
5808 unsigned char *cp_back = NULL;
5813 while(*cp && *cp++!='-');
5814 while (*cp || cp_back) {
5822 case '-': /* literal options */
5823 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
5827 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5828 p = (unsigned char *)long_option[i].name;
5829 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5830 if (*p == cp[j] || cp[j] == SP){
5837 #if !defined(PERL_XS) && !defined(WIN32DLL)
5838 fprintf(stderr, "unknown long option: --%s\n", cp);
5842 while(*cp && *cp != SP && cp++);
5843 if (long_option[i].alias[0]){
5845 cp = (unsigned char *)long_option[i].alias;
5848 if (strcmp(long_option[i].name, "help") == 0){
5853 if (strcmp(long_option[i].name, "ic=") == 0){
5854 enc = nkf_enc_find((char *)p);
5856 input_encoding = enc;
5859 if (strcmp(long_option[i].name, "oc=") == 0){
5860 enc = nkf_enc_find((char *)p);
5861 /* if (enc <= 0) continue; */
5863 output_encoding = enc;
5866 if (strcmp(long_option[i].name, "guess=") == 0){
5867 if (p[0] == '0' || p[0] == '1') {
5875 if (strcmp(long_option[i].name, "overwrite") == 0){
5878 preserve_time_f = TRUE;
5881 if (strcmp(long_option[i].name, "overwrite=") == 0){
5884 preserve_time_f = TRUE;
5886 backup_suffix = (char *)p;
5889 if (strcmp(long_option[i].name, "in-place") == 0){
5892 preserve_time_f = FALSE;
5895 if (strcmp(long_option[i].name, "in-place=") == 0){
5898 preserve_time_f = FALSE;
5900 backup_suffix = (char *)p;
5905 if (strcmp(long_option[i].name, "cap-input") == 0){
5909 if (strcmp(long_option[i].name, "url-input") == 0){
5914 #ifdef NUMCHAR_OPTION
5915 if (strcmp(long_option[i].name, "numchar-input") == 0){
5921 if (strcmp(long_option[i].name, "no-output") == 0){
5925 if (strcmp(long_option[i].name, "debug") == 0){
5930 if (strcmp(long_option[i].name, "cp932") == 0){
5931 #ifdef SHIFTJIS_CP932
5935 #ifdef UTF8_OUTPUT_ENABLE
5936 ms_ucs_map_f = UCS_MAP_CP932;
5940 if (strcmp(long_option[i].name, "no-cp932") == 0){
5941 #ifdef SHIFTJIS_CP932
5945 #ifdef UTF8_OUTPUT_ENABLE
5946 ms_ucs_map_f = UCS_MAP_ASCII;
5950 #ifdef SHIFTJIS_CP932
5951 if (strcmp(long_option[i].name, "cp932inv") == 0){
5958 if (strcmp(long_option[i].name, "x0212") == 0){
5965 if (strcmp(long_option[i].name, "exec-in") == 0){
5969 if (strcmp(long_option[i].name, "exec-out") == 0){
5974 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5975 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5976 no_cp932ext_f = TRUE;
5979 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5980 no_best_fit_chars_f = TRUE;
5983 if (strcmp(long_option[i].name, "fb-skip") == 0){
5984 encode_fallback = NULL;
5987 if (strcmp(long_option[i].name, "fb-html") == 0){
5988 encode_fallback = encode_fallback_html;
5991 if (strcmp(long_option[i].name, "fb-xml") == 0){
5992 encode_fallback = encode_fallback_xml;
5995 if (strcmp(long_option[i].name, "fb-java") == 0){
5996 encode_fallback = encode_fallback_java;
5999 if (strcmp(long_option[i].name, "fb-perl") == 0){
6000 encode_fallback = encode_fallback_perl;
6003 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6004 encode_fallback = encode_fallback_subchar;
6007 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6008 encode_fallback = encode_fallback_subchar;
6009 unicode_subchar = 0;
6011 /* decimal number */
6012 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6013 unicode_subchar *= 10;
6014 unicode_subchar += hex2bin(p[i]);
6016 }else if(p[1] == 'x' || p[1] == 'X'){
6017 /* hexadecimal number */
6018 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6019 unicode_subchar <<= 4;
6020 unicode_subchar |= hex2bin(p[i]);
6024 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6025 unicode_subchar *= 8;
6026 unicode_subchar += hex2bin(p[i]);
6029 w16e_conv(unicode_subchar, &i, &j);
6030 unicode_subchar = i<<8 | j;
6034 #ifdef UTF8_OUTPUT_ENABLE
6035 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6036 ms_ucs_map_f = UCS_MAP_MS;
6040 #ifdef UNICODE_NORMALIZATION
6041 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
6046 if (strcmp(long_option[i].name, "prefix=") == 0){
6047 if (nkf_isgraph(p[0])){
6048 for (i = 1; nkf_isgraph(p[i]); i++){
6049 prefix_table[p[i]] = p[0];
6054 #if !defined(PERL_XS) && !defined(WIN32DLL)
6055 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6060 case 'b': /* buffered mode */
6063 case 'u': /* non-buffered mode */
6066 case 't': /* transparent mode */
6071 } else if (*cp=='2') {
6075 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6083 case 'j': /* JIS output */
6085 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6087 case 'e': /* AT&T EUC output */
6088 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6090 case 's': /* SJIS output */
6091 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6093 case 'l': /* ISO8859 Latin-1 support, no conversion */
6094 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6095 input_encoding = nkf_enc_from_index(ISO_8859_1);
6097 case 'i': /* Kanji IN ESC-$-@/B */
6098 if (*cp=='@'||*cp=='B')
6099 kanji_intro = *cp++;
6101 case 'o': /* ASCII IN ESC-(-J/B/H */
6102 /* ESC ( H was used in initial JUNET messages */
6103 if (*cp=='J'||*cp=='B'||*cp=='H')
6104 ascii_intro = *cp++;
6108 bit:1 katakana->hiragana
6109 bit:2 hiragana->katakana
6111 if ('9'>= *cp && *cp>='0')
6112 hira_f |= (*cp++ -'0');
6119 #if defined(MSDOS) || defined(__OS2__)
6126 show_configuration();
6134 #ifdef UTF8_OUTPUT_ENABLE
6135 case 'w': /* UTF-8 output */
6140 output_encoding = nkf_enc_from_index(UTF_8N);
6142 output_bom_f = TRUE;
6143 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6147 if ('1'== cp[0] && '6'==cp[1]) {
6150 } else if ('3'== cp[0] && '2'==cp[1]) {
6154 output_encoding = nkf_enc_from_index(UTF_8);
6159 output_endian = ENDIAN_LITTLE;
6160 } else if (cp[0] == 'B') {
6163 output_encoding = nkf_enc_from_index(enc_idx);
6168 enc_idx = enc_idx == UTF_16
6169 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6170 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6172 output_bom_f = TRUE;
6173 enc_idx = enc_idx == UTF_16
6174 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6175 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6177 output_encoding = nkf_enc_from_index(enc_idx);
6181 #ifdef UTF8_INPUT_ENABLE
6182 case 'W': /* UTF input */
6185 input_encoding = nkf_enc_from_index(UTF_8);
6188 if ('1'== cp[0] && '6'==cp[1]) {
6190 input_endian = ENDIAN_BIG;
6192 } else if ('3'== cp[0] && '2'==cp[1]) {
6194 input_endian = ENDIAN_BIG;
6197 input_encoding = nkf_enc_from_index(UTF_8);
6202 input_endian = ENDIAN_LITTLE;
6203 } else if (cp[0] == 'B') {
6205 input_endian = ENDIAN_BIG;
6207 enc_idx = (enc_idx == UTF_16
6208 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6209 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6210 input_encoding = nkf_enc_from_index(enc_idx);
6214 /* Input code assumption */
6215 case 'J': /* ISO-2022-JP input */
6216 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6218 case 'E': /* EUC-JP input */
6219 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6221 case 'S': /* Shift_JIS input */
6222 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6224 case 'Z': /* Convert X0208 alphabet to ASCII */
6226 bit:0 Convert JIS X 0208 Alphabet to ASCII
6227 bit:1 Convert Kankaku to one space
6228 bit:2 Convert Kankaku to two spaces
6229 bit:3 Convert HTML Entity
6230 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6232 while ('0'<= *cp && *cp <='9') {
6233 alpha_f |= 1 << (*cp++ - '0');
6235 if (!alpha_f) alpha_f = 1;
6237 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6238 x0201_f = FALSE; /* No X0201->X0208 conversion */
6240 ESC-(-I in JIS, EUC, MS Kanji
6241 SI/SO in JIS, EUC, MS Kanji
6242 SS2 in EUC, JIS, not in MS Kanji
6243 MS Kanji (0xa0-0xdf)
6245 ESC-(-I in JIS (0x20-0x5f)
6246 SS2 in EUC (0xa0-0xdf)
6247 0xa0-0xdf in MS Kanji (0xa0-0xdf)
6250 case 'X': /* Convert X0201 kana to X0208 */
6253 case 'F': /* preserve new lines */
6254 fold_preserve_f = TRUE;
6255 case 'f': /* folding -f60 or -f */
6258 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6260 fold_len += *cp++ - '0';
6262 if (!(0<fold_len && fold_len<BUFSIZ))
6263 fold_len = DEFAULT_FOLD;
6267 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6269 fold_margin += *cp++ - '0';
6273 case 'm': /* MIME support */
6274 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6275 if (*cp=='B'||*cp=='Q') {
6276 mime_decode_mode = *cp++;
6277 mimebuf_f = FIXED_MIME;
6278 } else if (*cp=='N') {
6279 mime_f = TRUE; cp++;
6280 } else if (*cp=='S') {
6281 mime_f = STRICT_MIME; cp++;
6282 } else if (*cp=='0') {
6283 mime_decode_f = FALSE;
6284 mime_f = FALSE; cp++;
6286 mime_f = STRICT_MIME;
6289 case 'M': /* MIME output */
6292 mimeout_f = FIXED_MIME; cp++;
6293 } else if (*cp=='Q') {
6295 mimeout_f = FIXED_MIME; cp++;
6300 case 'B': /* Broken JIS support */
6302 bit:1 allow any x on ESC-(-x or ESC-$-x
6303 bit:2 reset to ascii on NL
6305 if ('9'>= *cp && *cp>='0')
6306 broken_f |= 1<<(*cp++ -'0');
6311 case 'O':/* for Output file */
6315 case 'c':/* add cr code */
6318 case 'd':/* delete cr code */
6321 case 'I': /* ISO-2022-JP output */
6324 case 'L': /* line mode */
6325 if (*cp=='u') { /* unix */
6326 eolmode_f = LF; cp++;
6327 } else if (*cp=='m') { /* mac */
6328 eolmode_f = CR; cp++;
6329 } else if (*cp=='w') { /* windows */
6330 eolmode_f = CRLF; cp++;
6331 } else if (*cp=='0') { /* no conversion */
6332 eolmode_f = 0; cp++;
6337 if ('2' <= *cp && *cp <= '9') {
6340 } else if (*cp == '0' || *cp == '1') {
6349 /* multiple options in a string are allowed for the Perl module */
6350 while(*cp && *cp++!='-');
6353 #if !defined(PERL_XS) && !defined(WIN32DLL)
6354 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6356 /* bogus option but ignored */
6364 #include "nkf32dll.c"
6365 #elif defined(PERL_XS)
6366 #else /* WIN32DLL */
6368 main(int argc, char **argv)
6373 char *outfname = NULL;
6376 #ifdef EASYWIN /*Easy Win */
6377 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6379 #ifdef DEFAULT_CODE_LOCALE
6380 setlocale(LC_CTYPE, "");
6384 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6385 cp = (unsigned char *)*argv;
6390 if (pipe(fds) < 0 || (pid = fork()) < 0){
6401 execvp(argv[1], &argv[1]);
6418 int debug_f_back = debug_f;
6421 int exec_f_back = exec_f;
6424 int x0212_f_back = x0212_f;
6426 int x0213_f_back = x0213_f;
6427 int guess_f_back = guess_f;
6429 guess_f = guess_f_back;
6432 debug_f = debug_f_back;
6435 exec_f = exec_f_back;
6437 x0212_f = x0212_f_back;
6438 x0213_f = x0213_f_back;
6441 if (binmode_f == TRUE)
6442 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6443 if (freopen("","wb",stdout) == NULL)
6450 setbuf(stdout, (char *) NULL);
6452 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6455 if (binmode_f == TRUE)
6456 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6457 if (freopen("","rb",stdin) == NULL) return (-1);
6461 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
6465 kanji_convert(stdin);
6466 if (guess_f) print_guessed_code(NULL);
6470 int is_argument_error = FALSE;
6472 input_codename = NULL;
6475 iconv_for_check = 0;
6477 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6479 is_argument_error = TRUE;
6487 /* reopen file for stdout */
6488 if (file_out_f == TRUE) {
6491 outfname = nkf_xmalloc(strlen(origfname)
6492 + strlen(".nkftmpXXXXXX")
6494 strcpy(outfname, origfname);
6498 for (i = strlen(outfname); i; --i){
6499 if (outfname[i - 1] == '/'
6500 || outfname[i - 1] == '\\'){
6506 strcat(outfname, "ntXXXXXX");
6508 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6509 S_IREAD | S_IWRITE);
6511 strcat(outfname, ".nkftmpXXXXXX");
6512 fd = mkstemp(outfname);
6515 || (fd_backup = dup(fileno(stdout))) < 0
6516 || dup2(fd, fileno(stdout)) < 0
6527 outfname = "nkf.out";
6530 if(freopen(outfname, "w", stdout) == NULL) {
6534 if (binmode_f == TRUE) {
6535 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6536 if (freopen("","wb",stdout) == NULL)
6543 if (binmode_f == TRUE)
6544 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6545 if (freopen("","rb",fin) == NULL)
6550 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6554 char *filename = NULL;
6556 if (nfiles > 1) filename = origfname;
6557 if (guess_f) print_guessed_code(filename);
6563 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6571 if (dup2(fd_backup, fileno(stdout)) < 0){
6574 if (stat(origfname, &sb)) {
6575 fprintf(stderr, "Can't stat %s\n", origfname);
6577 /* Restore permissions */
6578 if (chmod(outfname, sb.st_mode)) {
6579 fprintf(stderr, "Can't set permission %s\n", outfname);
6582 /* Restore timestamp */
6583 if(preserve_time_f){
6584 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6585 tb[0] = tb[1] = sb.st_mtime;
6586 if (utime(outfname, tb)) {
6587 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6590 tb.actime = sb.st_atime;
6591 tb.modtime = sb.st_mtime;
6592 if (utime(outfname, &tb)) {
6593 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6598 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6600 unlink(backup_filename);
6602 if (rename(origfname, backup_filename)) {
6603 perror(backup_filename);
6604 fprintf(stderr, "Can't rename %s to %s\n",
6605 origfname, backup_filename);
6607 nkf_xfree(backup_filename);
6610 if (unlink(origfname)){
6615 if (rename(outfname, origfname)) {
6617 fprintf(stderr, "Can't rename %s to %s\n",
6618 outfname, origfname);
6620 nkf_xfree(outfname);
6625 if (is_argument_error)
6628 #ifdef EASYWIN /*Easy Win */
6629 if (file_out_f == FALSE)
6630 scanf("%d",&end_check);
6633 #else /* for Other OS */
6634 if (file_out_f == TRUE)
6636 #endif /*Easy Win */
6639 #endif /* WIN32DLL */