2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2018, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.1.5"
24 #define NKF_RELEASE_DATE "2018-12-15"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2018, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
215 {"ISO-2022-JP", ISO_2022_JP},
216 {"ISO2022JP-CP932", CP50220},
217 {"CP50220", CP50220},
218 {"CP50221", CP50221},
219 {"CSISO2022JP", CP50221},
220 {"CP50222", CP50222},
221 {"ISO-2022-JP-1", ISO_2022_JP_1},
222 {"ISO-2022-JP-3", ISO_2022_JP_3},
223 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
224 {"SHIFT_JIS", SHIFT_JIS},
226 {"MS_Kanji", SHIFT_JIS},
228 {"WINDOWS-31J", WINDOWS_31J},
229 {"CSWINDOWS31J", WINDOWS_31J},
230 {"CP932", WINDOWS_31J},
231 {"MS932", WINDOWS_31J},
232 {"CP10001", CP10001},
235 {"EUCJP-NKF", EUCJP_NKF},
236 {"CP51932", CP51932},
237 {"EUC-JP-MS", EUCJP_MS},
238 {"EUCJP-MS", EUCJP_MS},
239 {"EUCJPMS", EUCJP_MS},
240 {"EUC-JP-ASCII", EUCJP_ASCII},
241 {"EUCJP-ASCII", EUCJP_ASCII},
242 {"SHIFT_JISX0213", SHIFT_JISX0213},
243 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
244 {"EUC-JISX0213", EUC_JISX0213},
245 {"EUC-JIS-2004", EUC_JIS_2004},
248 {"UTF-8-BOM", UTF_8_BOM},
249 {"UTF8-MAC", UTF8_MAC},
250 {"UTF-8-MAC", UTF8_MAC},
252 {"UTF-16BE", UTF_16BE},
253 {"UTF-16BE-BOM", UTF_16BE_BOM},
254 {"UTF-16LE", UTF_16LE},
255 {"UTF-16LE-BOM", UTF_16LE_BOM},
257 {"UTF-32BE", UTF_32BE},
258 {"UTF-32BE-BOM", UTF_32BE_BOM},
259 {"UTF-32LE", UTF_32LE},
260 {"UTF-32LE-BOM", UTF_32LE_BOM},
265 #if defined(DEFAULT_CODE_JIS)
266 #define DEFAULT_ENCIDX ISO_2022_JP
267 #elif defined(DEFAULT_CODE_SJIS)
268 #define DEFAULT_ENCIDX SHIFT_JIS
269 #elif defined(DEFAULT_CODE_WINDOWS_31J)
270 #define DEFAULT_ENCIDX WINDOWS_31J
271 #elif defined(DEFAULT_CODE_EUC)
272 #define DEFAULT_ENCIDX EUC_JP
273 #elif defined(DEFAULT_CODE_UTF8)
274 #define DEFAULT_ENCIDX UTF_8
/*
 * Locale-independent ASCII character classification and hex helpers.
 *
 * Every parameter use is parenthesized so that compound expressions
 * (e.g. `a | b`, `x ? y : z`) are classified as a whole.  These are still
 * macros: the argument is evaluated more than once, so do not pass
 * expressions with side effects (`*p++`, `getc(f)`, ...).
 *
 * SP, TAB, CR, LF, DEL, SS3, CP932_TABLE_BEGIN and CP932_TABLE_END are
 * expected to be provided elsewhere in this file.
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c) ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c) ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP <= (c) && (c) <= '~')
#define nkf_isgraph(c) ('!' <= (c) && (c) <= '~')
/* Value of one hex digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
		    ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
		    ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/* Upper-case hex digit character for the low 4 bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c) & 15])
/* True when the second-lowest byte of c2 (taken as unsigned short) equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, and characters strictly between SP and DEL other than ? = _ ( ) . and '"' (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) <= 0x5F)
304 #define HOLD_SIZE 1024
305 #if defined(INT_IS_SHORT)
306 #define IOBUF_SIZE 2048
308 #define IOBUF_SIZE 16384
311 #define DEFAULT_J 'B'
312 #define DEFAULT_R 'B'
319 /* MIME preprocessor */
321 #ifdef EASYWIN /*Easy Win */
322 extern POINT _BufferSize;
331 void (*status_func)(struct input_code *, nkf_char);
332 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
336 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
337 static nkf_encoding *input_encoding = NULL;
338 static nkf_encoding *output_encoding = NULL;
340 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
342 * 0: Shift_JIS, eucJP-ascii
347 #define UCS_MAP_ASCII 0
349 #define UCS_MAP_CP932 2
350 #define UCS_MAP_CP10001 3
351 static int ms_ucs_map_f = UCS_MAP_ASCII;
353 #ifdef UTF8_INPUT_ENABLE
354 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
355 static int no_cp932ext_f = FALSE;
356 /* ignore ZERO WIDTH NO-BREAK SPACE */
357 static int no_best_fit_chars_f = FALSE;
358 static int input_endian = ENDIAN_BIG;
359 static int input_bom_f = FALSE;
360 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
361 static void (*encode_fallback)(nkf_char c) = NULL;
362 static void w_status(struct input_code *, nkf_char);
364 #ifdef UTF8_OUTPUT_ENABLE
365 static int output_bom_f = FALSE;
366 static int output_endian = ENDIAN_BIG;
369 static void std_putc(nkf_char c);
370 static nkf_char std_getc(FILE *f);
371 static nkf_char std_ungetc(nkf_char c,FILE *f);
373 static nkf_char broken_getc(FILE *f);
374 static nkf_char broken_ungetc(nkf_char c,FILE *f);
376 static nkf_char mime_getc(FILE *f);
378 static void mime_putc(nkf_char c);
382 #if !defined(PERL_XS) && !defined(WIN32DLL)
383 static unsigned char stdibuf[IOBUF_SIZE];
384 static unsigned char stdobuf[IOBUF_SIZE];
387 #define NKF_UNSPECIFIED (-TRUE)
390 static int unbuf_f = FALSE;
391 static int estab_f = FALSE;
392 static int nop_f = FALSE;
393 static int binmode_f = TRUE; /* binary mode */
394 static int rot_f = FALSE; /* rot14/43 mode */
395 static int hira_f = FALSE; /* hira/kata henkan */
396 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
397 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
398 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
399 static int mimebuf_f = FALSE; /* MIME buffered input */
400 static int broken_f = FALSE; /* convert ESC-less broken JIS */
401 static int iso8859_f = FALSE; /* ISO8859 through */
402 static int mimeout_f = FALSE; /* base64 mode */
403 static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */
404 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
406 #ifdef UNICODE_NORMALIZATION
407 static int nfc_f = FALSE;
408 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc */
409 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
413 static int cap_f = FALSE;
414 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
415 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
417 static int url_f = FALSE;
418 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
419 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Bit layout helpers for nkf_char values.
 *
 * CLASS_MASK selects the top byte, VALUE_MASK the low 24 bits.  A value whose
 * top byte equals CLASS_UNICODE is tagged as carrying a Unicode code point in
 * its low 24 bits; PREFIX_EUCG3 is OR-ed in by nkf_char_euc3_new().
 *
 * Macro parameters are parenthesized so compound arguments are masked as a
 * whole; arguments may still be evaluated more than once.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX	NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)

/*
 * Combine a UTF-16 surrogate pair into a code point.
 * 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000, i.e. both surrogate bases
 * are removed and the supplementary-plane offset is added in one step.
 */
#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
436 #ifdef NUMCHAR_OPTION
437 static int numchar_f = FALSE;
438 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ngetc */
439 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
443 static int noout_f = FALSE;
444 static void no_putc(nkf_char c);
445 static int debug_f = FALSE;
446 static void debug(const char *str);
447 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
450 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
451 static void set_input_codename(const char *codename);
454 static int exec_f = 0;
457 #ifdef SHIFTJIS_CP932
458 /* invert IBM extended characters to others */
459 static int cp51932_f = FALSE;
461 /* invert NEC-selected IBM extended characters to IBM extended characters */
462 static int cp932inv_f = TRUE;
464 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
465 #endif /* SHIFTJIS_CP932 */
467 static int x0212_f = FALSE;
468 static int x0213_f = FALSE;
470 static unsigned char prefix_table[256];
472 static void e_status(struct input_code *, nkf_char);
473 static void s_status(struct input_code *, nkf_char);
475 struct input_code input_code_list[] = {
476 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
477 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
478 #ifdef UTF8_INPUT_ENABLE
479 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
480 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
481 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
483 {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0}
486 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
487 static int base64_count = 0;
489 /* X0208 -> ASCII converter */
492 static int f_line = 0; /* chars in line */
493 static int f_prev = 0;
494 static int fold_preserve_f = FALSE; /* preserve new lines */
495 static int fold_f = FALSE;
496 static int fold_len = 0;
499 static unsigned char kanji_intro = DEFAULT_J;
500 static unsigned char ascii_intro = DEFAULT_R;
504 #define FOLD_MARGIN 10
505 #define DEFAULT_FOLD 60
507 static int fold_margin = FOLD_MARGIN;
509 /* process default */
512 no_connection2(ARG_UNUSED nkf_char c2, ARG_UNUSED nkf_char c1, ARG_UNUSED nkf_char c0)
514 fprintf(stderr,"nkf internal module connection failure.\n");
520 no_connection(nkf_char c2, nkf_char c1)
522 no_connection2(c2,c1,0);
525 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
526 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
528 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
529 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
530 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
531 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
532 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
533 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
534 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
536 /* static redirections */
538 static void (*o_putc)(nkf_char c) = std_putc;
540 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
541 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
543 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of bgetc */
544 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
546 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
548 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
549 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
551 /* for strict mime */
552 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
553 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
556 static int output_mode = ASCII; /* output kanji mode */
557 static int input_mode = ASCII; /* input kanji mode */
558 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
560 /* X0201 / X0208 conversion tables */
562 /* X0201 kana conversion table */
564 static const unsigned char cv[]= {
565 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
566 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
567 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
568 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
569 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
570 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
571 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
572 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
573 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
574 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
575 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
576 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
577 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
578 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
579 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
580 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
584 /* X0201 kana conversion table for dakuten */
586 static const unsigned char dv[]= {
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
592 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
593 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
594 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
595 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
596 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
598 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 /* X0201 kana conversion table for han-dakuten */
607 static const unsigned char ev[]= {
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
618 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
619 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
626 /* X0201 kana to X0213 conversion table for han-dakuten */
628 static const unsigned char ev_x0213[]= {
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
634 0x00,0x00,0x00,0x00,0x25,0x77,0x25,0x78,
635 0x25,0x79,0x25,0x7a,0x25,0x7b,0x00,0x00,
636 0x00,0x00,0x00,0x00,0x25,0x7c,0x00,0x00,
637 0x00,0x00,0x00,0x00,0x25,0x7d,0x00,0x00,
638 0x25,0x7e,0x00,0x00,0x00,0x00,0x00,0x00,
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
648 /* X0208 kigou conversion table */
649 /* 0x8140 - 0x819e */
650 static const unsigned char fv[] = {
652 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
653 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
654 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
656 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
657 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
658 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
660 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
668 static int option_mode = 0;
669 static int file_out_f = FALSE;
671 static int overwrite_f = FALSE;
672 static int preserve_time_f = FALSE;
673 static int backup_f = FALSE;
674 static char *backup_suffix = "";
677 static int eolmode_f = 0; /* CR, LF, CRLF */
678 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
679 static nkf_char prev_cr = 0; /* CR or 0 */
680 #ifdef EASYWIN /*Easy Win */
681 static int end_check;
685 nkf_xmalloc(size_t size)
689 if (size == 0) size = 1;
693 perror("can't malloc");
701 nkf_xrealloc(void *ptr, size_t size)
703 if (size == 0) size = 1;
705 ptr = realloc(ptr, size);
707 perror("can't realloc");
714 #define nkf_xfree(ptr) free(ptr)
717 nkf_str_caseeql(const char *src, const char *target)
720 for (i = 0; src[i] && target[i]; i++) {
721 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
723 if (src[i] || target[i]) return FALSE;
728 nkf_enc_from_index(int idx)
730 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
733 return &nkf_encoding_table[idx];
737 nkf_enc_find_index(const char *name)
740 if (name[0] == 'X' && *(name+1) == '-') name += 2;
741 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
742 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
743 return encoding_name_to_id_table[i].id;
750 nkf_enc_find(const char *name)
753 idx = nkf_enc_find_index(name);
754 if (idx < 0) return 0;
755 return nkf_enc_from_index(idx);
758 #define nkf_enc_name(enc) (enc)->name
759 #define nkf_enc_to_index(enc) (enc)->id
760 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
761 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
762 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
763 #define nkf_enc_asciicompat(enc) (\
764 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
765 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
766 #define nkf_enc_unicode_p(enc) (\
767 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
768 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
769 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
770 #define nkf_enc_cp5022x_p(enc) (\
771 nkf_enc_to_index(enc) == CP50220 ||\
772 nkf_enc_to_index(enc) == CP50221 ||\
773 nkf_enc_to_index(enc) == CP50222)
775 #ifdef DEFAULT_CODE_LOCALE
777 nkf_locale_charmap(void)
779 #ifdef HAVE_LANGINFO_H
780 return nl_langinfo(CODESET);
781 #elif defined(__WIN32__)
783 sprintf(buf, "CP%d", GetACP());
785 #elif defined(__OS2__)
786 # if defined(INT_IS_SHORT)
792 ULONG ulCP[1], ulncp;
793 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
794 if (ulCP[0] == 932 || ulCP[0] == 943)
795 strcpy(buf, "Shift_JIS");
797 sprintf(buf, "CP%lu", ulCP[0]);
805 nkf_locale_encoding(void)
807 nkf_encoding *enc = 0;
808 const char *encname = nkf_locale_charmap();
810 enc = nkf_enc_find(encname);
813 #endif /* DEFAULT_CODE_LOCALE */
816 nkf_utf8_encoding(void)
818 return &nkf_encoding_table[UTF_8];
822 nkf_default_encoding(void)
824 nkf_encoding *enc = 0;
825 #ifdef DEFAULT_CODE_LOCALE
826 enc = nkf_locale_encoding();
827 #elif defined(DEFAULT_ENCIDX)
828 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
830 if (!enc) enc = nkf_utf8_encoding();
841 nkf_buf_new(int length)
843 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
844 buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length);
852 nkf_buf_dispose(nkf_buf_t *buf)
859 #define nkf_buf_length(buf) ((buf)->len)
860 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
863 nkf_buf_at(nkf_buf_t *buf, int index)
865 assert(index <= buf->len);
866 return buf->ptr[index];
870 nkf_buf_clear(nkf_buf_t *buf)
876 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
878 if (buf->capa <= buf->len) {
881 buf->ptr[buf->len++] = c;
885 nkf_buf_pop(nkf_buf_t *buf)
887 assert(!nkf_buf_empty_p(buf));
888 return buf->ptr[--buf->len];
891 /* Normalization Form C */
894 #define fprintf dllprintf
900 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
907 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
908 #ifdef UTF8_OUTPUT_ENABLE
909 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
910 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
913 #ifdef UTF8_INPUT_ENABLE
914 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
915 " UTF option is -W[8,[16,32][B,L]]\n"
917 " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
921 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
922 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
923 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
926 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
927 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
928 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
929 " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n"
932 " O Output to File (DEFAULT 'nkf.out')\n"
933 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
936 " --ic=<encoding> Specify the input encoding\n"
937 " --oc=<encoding> Specify the output encoding\n"
938 " --hiragana --katakana Hiragana/Katakana Conversion\n"
939 " --katakana-hiragana Converts each other\n"
943 " --{cap, url}-input Convert hex after ':' or '%%'\n"
945 #ifdef NUMCHAR_OPTION
946 " --numchar-input Convert Unicode Character Reference\n"
948 #ifdef UTF8_INPUT_ENABLE
949 " --fb-{skip, html, xml, perl, java, subchar}\n"
950 " Specify unassigned character's replacement\n"
955 " --in-place[=SUF] Overwrite original files\n"
956 " --overwrite[=SUF] Preserve timestamp of original files\n"
958 " -g --guess Guess the input code\n"
959 " -v --version Print the version\n"
960 " --help/-V Print this help / configuration\n"
966 show_configuration(void)
969 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
970 " Compile-time options:\n"
971 " Compiled at: " __DATE__ " " __TIME__ "\n"
974 " Default output encoding: "
975 #ifdef DEFAULT_CODE_LOCALE
976 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
977 #elif defined(DEFAULT_ENCIDX)
978 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
984 " Default output end of line: "
985 #if DEFAULT_NEWLINE == CR
987 #elif DEFAULT_NEWLINE == CRLF
993 " Decode MIME encoded string: "
994 #if MIME_DECODE_DEFAULT
1000 " Convert JIS X 0201 Katakana: "
1007 " --help, --version output: "
1008 #if HELP_OUTPUT_HELP_OUTPUT
1019 get_backup_filename(const char *suffix, const char *filename)
1021 char *backup_filename;
1022 int asterisk_count = 0;
1024 int filename_length = strlen(filename);
1026 for(i = 0; suffix[i]; i++){
1027 if(suffix[i] == '*') asterisk_count++;
1031 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
1032 for(i = 0, j = 0; suffix[i];){
1033 if(suffix[i] == '*'){
1034 backup_filename[j] = '\0';
1035 strncat(backup_filename, filename, filename_length);
1037 j += filename_length;
1039 backup_filename[j++] = suffix[i++];
1042 backup_filename[j] = '\0';
1044 j = filename_length + strlen(suffix);
1045 backup_filename = nkf_xmalloc(j + 1);
1046 strcpy(backup_filename, filename);
1047 strcat(backup_filename, suffix);
1048 backup_filename[j] = '\0';
1050 return backup_filename;
1054 #ifdef UTF8_INPUT_ENABLE
1056 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1061 if(c >= NKF_INT32_C(1)<<shift){
1063 (*f)(0, bin2hex(c>>shift));
1074 encode_fallback_html(nkf_char c)
1079 if(c >= NKF_INT32_C(1000000))
1080 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1081 if(c >= NKF_INT32_C(100000))
1082 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1084 (*oconv)(0, 0x30+(c/10000 )%10);
1086 (*oconv)(0, 0x30+(c/1000 )%10);
1088 (*oconv)(0, 0x30+(c/100 )%10);
1090 (*oconv)(0, 0x30+(c/10 )%10);
1092 (*oconv)(0, 0x30+ c %10);
1098 encode_fallback_xml(nkf_char c)
1103 nkf_each_char_to_hex(oconv, c);
1109 encode_fallback_java(nkf_char c)
1113 if(!nkf_char_unicode_bmp_p(c)){
1114 int high = (c >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
1115 int low = (c & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
1117 (*oconv)(0, bin2hex(high>>12));
1118 (*oconv)(0, bin2hex(high>> 8));
1119 (*oconv)(0, bin2hex(high>> 4));
1120 (*oconv)(0, bin2hex(high ));
1123 (*oconv)(0, bin2hex(low>>12));
1124 (*oconv)(0, bin2hex(low>> 8));
1125 (*oconv)(0, bin2hex(low>> 4));
1126 (*oconv)(0, bin2hex(low ));
1129 (*oconv)(0, bin2hex(c>>12));
1130 (*oconv)(0, bin2hex(c>> 8));
1131 (*oconv)(0, bin2hex(c>> 4));
1132 (*oconv)(0, bin2hex(c ));
1138 encode_fallback_perl(nkf_char c)
1143 nkf_each_char_to_hex(oconv, c);
1149 encode_fallback_subchar(nkf_char c)
1151 c = unicode_subchar;
1152 (*oconv)((c>>8)&0xFF, c&0xFF);
1157 static const struct {
1181 {"katakana-hiragana","h3"},
1189 #ifdef UTF8_OUTPUT_ENABLE
1199 {"fb-subchar=", ""},
1201 #ifdef UTF8_INPUT_ENABLE
1202 {"utf8-input", "W"},
1203 {"utf16-input", "W16"},
1204 {"no-cp932ext", ""},
1205 {"no-best-fit-chars",""},
1207 #ifdef UNICODE_NORMALIZATION
1208 {"utf8mac-input", ""},
1220 #ifdef NUMCHAR_OPTION
1221 {"numchar-input", ""},
1227 #ifdef SHIFTJIS_CP932
1238 set_input_encoding(nkf_encoding *enc)
1240 switch (nkf_enc_to_index(enc)) {
1246 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1248 #ifdef SHIFTJIS_CP932
1251 #ifdef UTF8_OUTPUT_ENABLE
1252 ms_ucs_map_f = UCS_MAP_CP932;
1262 case ISO_2022_JP_2004:
1269 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1270 #ifdef SHIFTJIS_CP932
1273 #ifdef UTF8_OUTPUT_ENABLE
1274 ms_ucs_map_f = UCS_MAP_CP932;
1279 #ifdef SHIFTJIS_CP932
1282 #ifdef UTF8_OUTPUT_ENABLE
1283 ms_ucs_map_f = UCS_MAP_CP10001;
1291 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1292 #ifdef SHIFTJIS_CP932
1295 #ifdef UTF8_OUTPUT_ENABLE
1296 ms_ucs_map_f = UCS_MAP_CP932;
1300 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1301 #ifdef SHIFTJIS_CP932
1304 #ifdef UTF8_OUTPUT_ENABLE
1305 ms_ucs_map_f = UCS_MAP_MS;
1309 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1310 #ifdef SHIFTJIS_CP932
1313 #ifdef UTF8_OUTPUT_ENABLE
1314 ms_ucs_map_f = UCS_MAP_ASCII;
1317 case SHIFT_JISX0213:
1318 case SHIFT_JIS_2004:
1320 #ifdef SHIFTJIS_CP932
1322 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1328 #ifdef SHIFTJIS_CP932
1332 #ifdef UTF8_INPUT_ENABLE
1333 #ifdef UNICODE_NORMALIZATION
1341 input_endian = ENDIAN_BIG;
1345 input_endian = ENDIAN_LITTLE;
1350 input_endian = ENDIAN_BIG;
1354 input_endian = ENDIAN_LITTLE;
/*
 * set_output_encoding: apply the flag defaults implied by the chosen output
 * encoding.  Dispatches on nkf_enc_to_index(enc); the branches visible here
 * adjust cp932inv_f, ms_ucs_map_f, x0201_f, output_endian and output_bom_f.
 * NOTE(review): most case labels are not visible in this excerpt, so which
 * encoding each group of assignments belongs to must be confirmed against
 * the complete switch.
 */
1361 set_output_encoding(nkf_encoding *enc)
1363 switch (nkf_enc_to_index(enc)) {
1365 #ifdef SHIFTJIS_CP932
/* Only a cp932inv_f that is exactly TRUE is reset; other values are kept. */
1366 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1368 #ifdef UTF8_OUTPUT_ENABLE
1369 ms_ucs_map_f = UCS_MAP_CP932;
/* x0201_f is defaulted only while it is still NKF_UNSPECIFIED. */
1373 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1374 #ifdef SHIFTJIS_CP932
1375 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1377 #ifdef UTF8_OUTPUT_ENABLE
1378 ms_ucs_map_f = UCS_MAP_CP932;
1382 #ifdef SHIFTJIS_CP932
1383 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1388 #ifdef SHIFTJIS_CP932
1389 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1393 case ISO_2022_JP_2004:
1396 #ifdef SHIFTJIS_CP932
1397 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1403 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1404 #ifdef UTF8_OUTPUT_ENABLE
1405 ms_ucs_map_f = UCS_MAP_CP932;
1409 #ifdef UTF8_OUTPUT_ENABLE
1410 ms_ucs_map_f = UCS_MAP_CP10001;
1415 #ifdef SHIFTJIS_CP932
1416 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1418 #ifdef UTF8_OUTPUT_ENABLE
1419 ms_ucs_map_f = UCS_MAP_ASCII;
1424 #ifdef SHIFTJIS_CP932
1425 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1427 #ifdef UTF8_OUTPUT_ENABLE
1428 ms_ucs_map_f = UCS_MAP_ASCII;
1432 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1433 #ifdef SHIFTJIS_CP932
1434 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1436 #ifdef UTF8_OUTPUT_ENABLE
1437 ms_ucs_map_f = UCS_MAP_CP932;
1441 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1443 #ifdef UTF8_OUTPUT_ENABLE
1444 ms_ucs_map_f = UCS_MAP_MS;
1448 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1450 #ifdef UTF8_OUTPUT_ENABLE
1451 ms_ucs_map_f = UCS_MAP_ASCII;
1454 case SHIFT_JISX0213:
1455 case SHIFT_JIS_2004:
1457 #ifdef SHIFTJIS_CP932
1458 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1465 #ifdef SHIFTJIS_CP932
1466 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1469 #ifdef UTF8_OUTPUT_ENABLE
/*
 * The remaining assignments (compiled only with UTF8_OUTPUT_ENABLE) pick the
 * output byte order and whether a BOM is written.  Branches that do not
 * assign output_endian leave it at whatever value it had on entry.
 */
1471 output_bom_f = TRUE;
1475 output_bom_f = TRUE;
1478 output_endian = ENDIAN_LITTLE;
1479 output_bom_f = FALSE;
1482 output_endian = ENDIAN_LITTLE;
1483 output_bom_f = TRUE;
1487 output_bom_f = TRUE;
1490 output_endian = ENDIAN_LITTLE;
1491 output_bom_f = FALSE;
1494 output_endian = ENDIAN_LITTLE;
1495 output_bom_f = TRUE;
/*
 * find_inputcode_byfunc: search the input_code table, starting at
 * input_code_list, for the entry whose iconv_func pointer equals the given
 * converter.  The loop bounds and the value returned on a miss are not
 * visible in this excerpt.
 */
1501 static struct input_code*
1502 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1505 struct input_code *p = input_code_list;
1507 if (iconv_func == p->iconv_func){
/*
 * set_iconv: update the active input converter state.
 *   f          - flag value; under INPUT_CODE_FIX it is honoured only when
 *                non-zero or when no input_encoding is set, and the value
 *                -TRUE is treated as "FORCE".
 *   iconv_func - candidate converter.
 * Once a code is established (estab_f) and the global iconv differs from
 * iconv_for_check, the matching input_code entry is looked up, its name is
 * handed to set_input_codename, and iconv_for_check is synchronised.
 * NOTE(review): the statements guarded by the first two conditions are not
 * visible in this excerpt.
 */
1517 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1519 #ifdef INPUT_CODE_FIX
1520 if (f || !input_encoding)
1527 #ifdef INPUT_CODE_FIX
1528 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1534 if (estab_f && iconv_for_check != iconv){
1535 struct input_code *p = find_inputcode_byfunc(iconv);
1537 set_input_codename(p->name);
1540 iconv_for_check = iconv;
/*
 * x0212_shift: relocate a row value in 0x75..0x7f.  Two branches apply
 * different offsets: one maps 0x75 to 0x109, the other maps 0x75 to 0x113.
 * NOTE(review): the condition choosing between the two branches, any
 * masking of c, and the return statement are not visible in this excerpt.
 */
1547 x0212_shift(nkf_char c)
1552 if (0x75 <= c && c <= 0x7f){
1553 ret = c + (0x109 - 0x75);
1556 if (0x75 <= c && c <= 0x7f){
1557 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: fold shifted row values back into 0x75..0x7e.
 *   0x7f..0x88 -> 0x75..0x7e (plain value)
 *   0x89..0x92 -> 0x75..0x7e with PREFIX_EUCG3 | 0x80 set
 * The result for values outside both ranges is not visible in this excerpt.
 */
1565 x0212_unshift(nkf_char c)
1568 if (0x7f <= c && c <= 0x88){
1569 ret = c + (0x75 - 0x7f);
1570 }else if (0x89 <= c && c <= 0x92){
1571 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1575 #endif /* X0212_ENABLE */
/*
 * is_x0213_2_in_x0212: predicate on a row number ("ku").  The lookup table
 * is non-zero for rows 1, 3-5, 8 and 12-15; a separate test covers rows
 * 78..94.  How ku is derived from c1, and which range of ku uses the table,
 * are not visible in this excerpt.
 */
1578 is_x0213_2_in_x0212(nkf_char c1)
1580 static const char x0213_2_table[] =
1581 {0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1};
1584 return x0213_2_table[ku]; /* 1, 3-5, 8, 12-15 */
1585 if (78 <= ku && ku <= 94)
/*
 * e2s_conv: convert a row/cell pair (c2, c1) into a two-byte code stored
 * through p2 (lead) and p1 (trail).  Either output pointer may be NULL; each
 * store is guarded.  s_oconv uses this for its SHIFT_JIS output.
 * Returns 1 when the (possibly shifted) row exceeds 0x7F; other return
 * values are not visible in this excerpt.
 */
1591 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
/* Rows accepted by is_x0213_2_in_x0212 get their own lead-byte formulas. */
1596 if (x0213_f && is_x0213_2_in_x0212(ndx)){
1597 if((0x21 <= ndx && ndx <= 0x2F)){
1598 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1599 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1601 }else if(0x6E <= ndx && ndx <= 0x7E){
1602 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1603 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/* Otherwise a printable row is looked up in the x0212_shiftjis table. */
1609 else if(nkf_isgraph(ndx)){
1611 const unsigned short *ptr;
1612 ptr = x0212_shiftjis[ndx - 0x21];
1614 val = ptr[(c1 & 0x7f) - 0x21];
1623 c2 = x0212_shift(c2);
1625 #endif /* X0212_ENABLE */
1627 if(0x7F < c2) return 1;
/*
 * Generic mapping: two rows share one lead byte.  For an odd row the trail is
 * c1 + 0x1f, or c1 + 0x20 once c1 >= 0x60 (which steps over 0x7f); for an
 * even row the trail is c1 + 0x7e.
 */
1628 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1629 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s2e_conv: convert a lead/trail byte pair (c2, c1) into a row/cell pair
 * (the counterpart of e2s_conv).  Returns 1 at once when the trail byte is
 * above 0xFC.  The final stores through p2/p1 and the success return are not
 * visible in this excerpt.
 */
1634 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1636 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
/* Indexed by [lead - 0xF0][trail > 0x9E]; see its use further down. */
1639 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1640 if (0xFC < c1) return 1;
1641 #ifdef SHIFTJIS_CP932
/* IBM extension lead bytes are remapped through shiftjis_cp932. */
1642 if (!cp932inv_f && !x0213_f && is_ibmext_in_sjis(c2)){
1643 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1650 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1651 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1657 #endif /* SHIFTJIS_CP932 */
1659 if (!x0213_f && is_ibmext_in_sjis(c2)){
1660 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1663 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
/* With x0213_f, lead bytes >= 0xF0 produce PREFIX_EUCG3-tagged rows. */
1676 if(x0213_f && c2 >= 0xF0){
1677 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1678 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1679 }else{ /* 78<=k<=94 */
1680 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1681 if (0x9E < c1) c2++;
1684 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1685 #define SJ6394 0x0161 /* 63 - 94 ku offset */
/* Generic case: row = 2 * lead - offset, plus one when the trail is > 0x9E. */
1686 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1687 if (0x9E < c1) c2++;
/* Trail bytes above DEL are lowered by SP, the others by 0x1F. */
1690 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1697 c2 = x0212_unshift(c2);
1704 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * nkf_unicode_to_utf8: encode the code point val as UTF-8 bytes stored
 * through p1..p4 (the pointers are dereferenced without a NULL check).
 * Visible branches: val < 0x800 -> 2 bytes, BMP value -> 3 bytes, any other
 * value accepted by nkf_char_unicode_value_p -> 4 bytes.  The single-byte
 * branch, the fall-through branch and the stores to the unused outputs are
 * not visible in this excerpt.
 */
1706 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1714 }else if (val < 0x800){
1715 *p1 = 0xc0 | (val >> 6);
1716 *p2 = 0x80 | (val & 0x3f);
1719 } else if (nkf_char_unicode_bmp_p(val)) {
1720 *p1 = 0xe0 | (val >> 12);
1721 *p2 = 0x80 | ((val >> 6) & 0x3f);
1722 *p3 = 0x80 | ( val & 0x3f);
1724 } else if (nkf_char_unicode_value_p(val)) {
1725 *p1 = 0xf0 | (val >> 18);
1726 *p2 = 0x80 | ((val >> 12) & 0x3f);
1727 *p3 = 0x80 | ((val >> 6) & 0x3f);
1728 *p4 = 0x80 | ( val & 0x3f);
/*
 * nkf_utf8_to_unicode: assemble a code point from up to four UTF-8 bytes
 * (c1 is the lead byte).  The sequence length is chosen from the lead byte;
 * continuation bytes are masked with 0x3F but not validated here.
 * The results for the single-byte, invalid-lead and final else branches are
 * not visible in this excerpt.
 */
1738 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
1745 else if (c1 <= 0xC1) {
1746 /* trail byte or invalid */
1749 else if (c1 <= 0xDF) {
1751 wc = (c1 & 0x1F) << 6;
1754 else if (c1 <= 0xEF) {
1756 wc = (c1 & 0x0F) << 12;
1757 wc |= (c2 & 0x3F) << 6;
/*
 * NOTE(review): this branch tests c2 while every sibling branch tests the
 * lead byte c1.  It looks like a typo for "c1 <= 0xF4"; as written, a lead
 * byte above 0xF4 is still decoded whenever c2 <= 0xF4 -- confirm before
 * changing.
 */
1760 else if (c2 <= 0xF4) {
1762 wc = (c1 & 0x0F) << 18;
1763 wc |= (c2 & 0x3F) << 12;
1764 wc |= (c3 & 0x3F) << 6;
1774 #ifdef UTF8_INPUT_ENABLE
/*
 * unicode_to_jis_common2: second-level table lookup shared by the UTF-8 to
 * JIS conversions.
 *   c1, c0 - indices into the first-level table pp and its sub-table
 *   pp     - table of sub-table pointers, psize entries
 *   p2, p1 - outputs for the converted pair
 * Returns 1 ("no mapping") when pp or the selected sub-table is NULL, when
 * an index is out of range, or when the table entry is 0.  The stores to
 * p2/p1 and the success return are not visible in this excerpt.
 */
1776 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1777 const unsigned short *const *pp, nkf_char psize,
1778 nkf_char *p2, nkf_char *p1)
1781 const unsigned short *p;
1784 if (pp == 0) return 1;
1787 if (c1 < 0 || psize <= c1) return 1;
1789 if (p == 0) return 1;
/* The sub-table bound is always sizeof_utf8_to_euc_C2, whatever pp is. */
1792 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1794 if (val == 0) return 1;
/* With no_cp932ext_f, entries in the NEC / IBM extension areas are filtered. */
1795 if (no_cp932ext_f && (
1796 (val>>8) == 0x2D || /* NEC special characters */
1797 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* A row value of SO is reported as JIS_X_0201_1976_K instead. */
1805 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * unicode_to_jis_common: convert a 2- or 3-byte UTF-8 sequence (c2 is the
 * lead byte, followed by c1 and c0) to a JIS pair written through p2/p1.
 * The mapping table set is selected by ms_ucs_map_f and x0213_f.  When
 * no_best_fit_chars_f is set, code points listed in the tables and tests
 * below are rejected with return 1 before any lookup.
 * NOTE(review): several conditions, the default table in each selection
 * chain and the final return are not visible in this excerpt.
 */
1813 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1815 const unsigned short *const *pp;
1816 const unsigned short *const *const *ppp;
/*
 * Rejection tables indexed by the low six bits of the second byte
 * (c1 & 0x3F); a non-zero entry makes the caller return 1.
 */
1817 static const char no_best_fit_chars_table_C2[] =
1818 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1819 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1820 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1821 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1822 static const char no_best_fit_chars_table_C2_ms[] =
1823 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1824 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1825 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1826 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1827 static const char no_best_fit_chars_table_932_C2[] =
1828 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1829 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1830 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1831 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1832 static const char no_best_fit_chars_table_932_C3[] =
1833 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1834 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1835 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1836 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Two-byte sequences (lead byte below 0xe0). */
1842 }else if(c2 < 0xe0){
1843 if(no_best_fit_chars_f){
1844 if(ms_ucs_map_f == UCS_MAP_CP932){
1847 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1850 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1853 }else if(!cp932inv_f){
1856 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1859 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1862 }else if(ms_ucs_map_f == UCS_MAP_MS){
1863 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1864 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1882 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1883 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1884 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1885 x0213_f ? utf8_to_euc_2bytes_x0213 :
1887 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/*
 * Three-byte sequences.
 * NOTE(review): this branch is selected on the last byte c0, whereas the
 * previous branch tests the lead byte c2 and the table below is indexed by
 * c2 - 0xE0.  Confirm whether c2 was intended here.
 */
1888 }else if(c0 < 0xF0){
1889 if(no_best_fit_chars_f){
1890 if(ms_ucs_map_f == UCS_MAP_CP932){
1891 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1892 }else if(ms_ucs_map_f == UCS_MAP_MS){
1897 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1900 if(c0 == 0x92) return 1;
1905 if(c1 == 0x80 || c0 == 0x9C) return 1;
1908 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1913 if(c0 == 0x94) return 1;
1916 if(c0 == 0xBB) return 1;
1926 if(c0 == 0x95) return 1;
1929 if(c0 == 0xA5) return 1;
1936 if(c0 == 0x8D) return 1;
1939 if(c0 == 0x9E && !cp932inv_f) return 1;
1942 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1950 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1951 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1952 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1953 x0213_f ? utf8_to_euc_3bytes_x0213 :
1955 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1957 #ifdef SHIFTJIS_CP932
/*
 * Post-processing of a successful lookup whose row is PREFIX_EUCG3-tagged:
 * the result may be turned into a failure when encode_fallback is set, or
 * round-tripped through e2s_conv/s2e_conv.  The conditions selecting
 * between these are not visible in this excerpt.
 */
1958 if (!ret&& is_eucg3(*p2)) {
1960 if (encode_fallback) ret = 1;
1964 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1965 s2e_conv(s2, s1, p2, p1);
1975 #ifdef UTF8_OUTPUT_ENABLE
/*
 * X0213_SURROGATE_FIND(tbl, size, euc): linear scan of the first `size` rows
 * of tbl for a row whose column 0 equals euc.  The action taken on a match
 * is not visible in this excerpt.  The arguments are not parenthesised in
 * the expansion, so pass simple expressions only.
 */
1976 #define X0213_SURROGATE_FIND(tbl, size, euc) do { \
1978 for (i = 0; i < size; i++) \
1979 if (tbl[i][0] == euc) { \
/*
 * e2w_conv: map an internal row/cell pair (c2, c1) to a Unicode value by
 * table lookup.
 *   - c2 == JIS_X_0201_1976_K uses euc_to_utf8_1byte (with a special case
 *     for UCS_MAP_CP10001 that is not visible here);
 *   - PREFIX_EUCG3-tagged rows use the x0212_to_utf8_2bytes* tables;
 *   - other rows use a table chosen by x0213_f / ms_ucs_map_f.
 * With x0213_f, a table value in 0xD800..0xDBFF is a high surrogate whose
 * low half is searched in the surrogate tables; the pair is returned
 * combined by UTF16_TO_UTF32.  The value returned when no mapping exists is
 * not visible in this excerpt.
 */
1986 e2w_conv(nkf_char c2, nkf_char c1)
1988 const unsigned short *p;
1990 if (c2 == JIS_X_0201_1976_K) {
1991 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1999 p = euc_to_utf8_1byte;
2001 } else if (is_eucg3(c2)){
2002 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
/* Row index: drop the high bit and rebase so that row 0x21 becomes 0. */
2005 c2 = (c2&0x7f) - 0x21;
2006 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2008 x0213_f ? x0212_to_utf8_2bytes_x0213[c2] :
2009 x0212_to_utf8_2bytes[c2];
2015 c2 = (c2&0x7f) - 0x21;
2016 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2018 x0213_f ? euc_to_utf8_2bytes_x0213[c2] :
2019 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
2020 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
2021 euc_to_utf8_2bytes_ms[c2];
/* Cell index, rebased the same way as the row. */
2026 c1 = (c1 & 0x7f) - 0x21;
2027 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte) {
2028 nkf_char val = p[c1];
2029 if (x0213_f && 0xD800<=val && val<=0xDBFF) {
/* Rebuild the original two-byte code to use as the search key. */
2030 nkf_char euc = (c2+0x21)<<8 | (c1+0x21);
/* The surrogate table to search depends on which row table p points at. */
2032 if (p==x0212_to_utf8_2bytes_x0213[c2]) {
2033 X0213_SURROGATE_FIND(x0213_2_surrogate_table, sizeof_x0213_2_surrogate_table, euc);
2035 X0213_SURROGATE_FIND(x0213_1_surrogate_table, sizeof_x0213_1_surrogate_table, euc);
2038 return UTF16_TO_UTF32(val, low);
/*
 * e2w_combining: for the pair (c2, c1), return column 1 of the
 * x0213_combining_table row whose column 0 equals the pair packed as
 * (c2&0x7f)<<8 | (c1&0x7f).  The lookup is attempted only when comb is
 * listed in x0213_combining_chars.  The values returned when comb is not
 * listed or no row matches are not visible in this excerpt.
 */
2047 e2w_combining(nkf_char comb, nkf_char c2, nkf_char c1)
2051 for (i = 0; i < sizeof_x0213_combining_chars; i++)
2052 if (x0213_combining_chars[i] == comb)
/* i reaching the table size means comb was not found above. */
2054 if (i >= sizeof_x0213_combining_chars)
2056 euc = (c2&0x7f)<<8 | (c1&0x7f);
2057 for (i = 0; i < sizeof_x0213_combining_table; i++)
2058 if (x0213_combining_table[i][0] == euc)
2059 return x0213_combining_table[i][1];
/*
 * w2e_conv: convert a UTF-8 sequence (c2 lead byte, then c1, c0) to an
 * internal pair written through p2/p1.  Lead bytes 0xc0..0xef go through
 * unicode_to_jis_common.  Under NUMCHAR_OPTION, *p1 can instead receive the
 * decoded code point wrapped by nkf_char_unicode_new; the condition for
 * that is not visible in this excerpt.
 */
2065 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
2072 }else if (0xc0 <= c2 && c2 <= 0xef) {
2073 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2074 #ifdef NUMCHAR_OPTION
2077 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
2085 #ifdef UTF8_INPUT_ENABLE
/*
 * w16e_conv: convert a Unicode code point val to an internal pair written
 * through p2/p1.
 *   - BMP values are re-encoded as UTF-8 and passed to
 *     unicode_to_jis_common.
 *   - Other values are split into a surrogate pair and searched in the two
 *     x0213 surrogate tables; a hit in the second table yields a
 *     PREFIX_EUCG3-tagged row.
 * In some branches *p1 receives val wrapped by nkf_char_unicode_new; the
 * conditions for that and the return values are not visible in this
 * excerpt.
 */
2087 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
2089 nkf_char c1, c2, c3, c4;
2096 else if (nkf_char_unicode_bmp_p(val)){
2097 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2098 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
2101 *p1 = nkf_char_unicode_new(val);
/* 0xD7C0 equals 0xD800 - (0x10000 >> 10), folding the offset into the shift. */
2108 c1 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2109 c2 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2110 for (i = 0; i < sizeof_x0213_1_surrogate_table; i++)
2111 if (x0213_1_surrogate_table[i][1] == c1 && x0213_1_surrogate_table[i][2] == c2) {
2112 val = x0213_1_surrogate_table[i][0];
2117 for (i = 0; i < sizeof_x0213_2_surrogate_table; i++)
2118 if (x0213_2_surrogate_table[i][1] == c1 && x0213_2_surrogate_table[i][2] == c2) {
2119 val = x0213_2_surrogate_table[i][0];
2120 *p2 = PREFIX_EUCG3 | (val >> 8);
2126 *p1 = nkf_char_unicode_new(val);
/*
 * e_iconv: input converter taking up to three bytes (c2, c1, c0) and
 * normalising them to the internal pair.  The hand-off to the output side
 * and the return value are not visible in this excerpt.
 */
2133 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2135 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
/* With iso2022jp_f set and x0201_f clear, the pair GETA1/GETA2 is used. */
2136 if (iso2022jp_f && !x0201_f) {
2137 c2 = GETA1; c1 = GETA2;
2139 c2 = JIS_X_0201_1976_K;
/* 0x8f introduces a three-byte sequence: c1 is the row, c0 the cell. */
2143 }else if (c2 == 0x8f){
2147 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2148 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2149 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
/* Pack the 0x8f prefix above the 7-bit row value. */
2152 c2 = (c2 << 8) | (c1 & 0x7f);
2154 #ifdef SHIFTJIS_CP932
/* Round trip through e2s_conv/s2e_conv to normalise the pair. */
2157 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2158 s2e_conv(s2, s1, &c2, &c1);
2165 #endif /* SHIFTJIS_CP932 */
2167 #endif /* X0212_ENABLE */
2168 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2171 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2172 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2173 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2178 #ifdef SHIFTJIS_CP932
/* With cp51932_f, rows 0x79..0x7c are normalised by the same round trip. */
2179 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2181 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2182 s2e_conv(s2, s1, &c2, &c1);
2189 #endif /* SHIFTJIS_CP932 */
/*
 * s_iconv: input converter for two-byte input (c2 lead, c1 trail); c0 is
 * not used.  Propagates a non-zero result from s2e_conv to the caller.
 * NOTE(review): c2 is declared ARG_UNUSED although the body reads it --
 * the annotation looks stale; confirm before removing it.
 */
2197 s_iconv(ARG_UNUSED nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2199 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2200 if (iso2022jp_f && !x0201_f) {
2201 c2 = GETA1; c1 = GETA2;
2205 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
/*
 * Without x0213_f, lead bytes 0xF0..0xF9 map linearly onto code points from
 * 0xE000 upwards, 188 cells per lead byte; the "- (0x7E < c1)" term closes
 * the gap left by the unused trail byte 0x7F.
 */
2207 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2209 if(c1 == 0x7F) return 0;
2210 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2213 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2214 if (ret) return ret;
/*
 * x0213_wait_combining_p: scan x0213_combining_table for a row whose
 * column 1 equals wc.  The return values are not visible in this excerpt.
 */
2221 x0213_wait_combining_p(nkf_char wc)
2224 for (i = 0; i < sizeof_x0213_combining_table; i++) {
2225 if (x0213_combining_table[i][1] == wc) {
/*
 * x0213_combining_p: scan x0213_combining_chars for an entry equal to wc.
 * The return values are not visible in this excerpt.
 */
2233 x0213_combining_p(nkf_char wc)
2236 for (i = 0; i < sizeof_x0213_combining_chars; i++) {
2237 if (x0213_combining_chars[i] == wc) {
/*
 * w_iconv: UTF-8 input converter.  Note that the parameter order is
 * (c1, c2, c3), with c1 the lead byte.  The sequence is validated against
 * the lead-byte class, and a negative value is returned when more input is
 * required:
 *   -1 / -2 : c3 is still 0 in a class that needs further bytes.
 * The full sequence is then converted with w2e_conv, or wrapped as a
 * Unicode value for 4-byte sequences.
 * NOTE(review): the case labels, the assignment of c4 (initialised to 0
 * here; h_conv passes (c3<<8)|c4 packed in the third argument) and the
 * remaining return statements are not visible in this excerpt.
 */
2245 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2247 nkf_char ret = 0, c4 = 0;
/* Lead-byte classification table, indexed by c1 - 0xC0 in the switch below. */
2248 static const char w_iconv_utf8_1st_byte[] =
2250 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2251 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2252 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2253 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2260 if (c1 < 0 || 0xff < c1) {
2261 }else if (c1 == 0) { /* 0 : 1 byte*/
2263 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2266 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2268 if (c2 < 0x80 || 0xBF < c2) return 0;
/* Each class below narrows the legal range of the second byte. */
2271 if (c3 == 0) return -1;
2272 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2277 if (c3 == 0) return -1;
2278 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2282 if (c3 == 0) return -1;
2283 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2287 if (c3 == 0) return -2;
2288 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2292 if (c3 == 0) return -2;
2293 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2297 if (c3 == 0) return -2;
2298 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2306 if (c1 == 0 || c1 == EOF){
2307 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2308 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
/* With x0213_f, a character that may start a combining pair is held back. */
2311 if (x0213_f && x0213_wait_combining_p(nkf_utf8_to_unicode(c1, c2, c3, c4)))
2313 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/*
 * w_iconv_nocombine: convert the UTF-8 sequence (c1, c2, c3) with w2e_conv
 * directly, i.e. without the wait-for-combining-character check performed
 * by w_iconv.  What is done with ret afterwards is not visible in this
 * excerpt.
 */
2322 w_iconv_nocombine(nkf_char c1, nkf_char c2, nkf_char c3)
2324 /* continue from the line below 'return -3;' in w_iconv() */
2325 nkf_char ret = w2e_conv(c1, c2, c3, &c1, &c2);
/*
 * Status codes returned by unicode_iconv() and unicode_iconv_combine().
 * The expansions are bare negative literals without parentheses.
 */
2332 #define NKF_ICONV_INVALID_CODE_RANGE -13
2333 #define NKF_ICONV_WAIT_COMBINING_CHAR -14
2334 #define NKF_ICONV_NOT_COMBINED -15
/*
 * unicode_iconv: convert one Unicode code point wc to the internal pair.
 *   nocombine - when non-zero, the wait-for-combining-character check is
 *               skipped.
 * Returns NKF_ICONV_INVALID_CODE_RANGE for surrogate code points,
 * NKF_ICONV_WAIT_COMBINING_CHAR when wc may start an x0213 combining pair,
 * or a non-zero result propagated from w16e_conv.
 * NOTE(review): the range tests use strict "<", so wc == 0xFFFF takes the
 * "wc < 0x10FFFF" branch and wc == 0x10FFFF falls past both -- confirm this
 * is intended.  The leading branches and the final return are not visible
 * in this excerpt.
 */
2336 unicode_iconv(nkf_char wc, int nocombine)
/* (wc >> 11) == 27 selects exactly 0xD800..0xDFFF. */
2344 }else if ((wc>>11) == 27) {
2345 /* unpaired surrogate */
2346 return NKF_ICONV_INVALID_CODE_RANGE;
2347 }else if (wc < 0xFFFF) {
2348 if (!nocombine && x0213_f && x0213_wait_combining_p(wc))
2349 return NKF_ICONV_WAIT_COMBINING_CHAR;
2350 ret = w16e_conv(wc, &c2, &c1);
2351 if (ret) return ret;
2352 }else if (wc < 0x10FFFF) {
2354 c1 = nkf_char_unicode_new(wc);
2356 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * unicode_iconv_combine: try to convert the base character wc followed by
 * the combining character wc2 as one x0213 combined character.
 * Returns NKF_ICONV_NOT_COMBINED when wc2 is not a listed combining
 * character or no table row matches the pair, and
 * NKF_ICONV_INVALID_CODE_RANGE when wc2 is a surrogate or out of range.
 * The first condition and the success path after a table hit are not
 * visible in this excerpt.
 */
2363 unicode_iconv_combine(nkf_char wc, nkf_char wc2)
2369 return NKF_ICONV_NOT_COMBINED;
2370 }else if ((wc2>>11) == 27) {
2371 /* unpaired surrogate */
2372 return NKF_ICONV_INVALID_CODE_RANGE;
2373 }else if (wc2 < 0xFFFF) {
2374 if (!x0213_combining_p(wc2))
2375 return NKF_ICONV_NOT_COMBINED;
/* Row layout: [0] packed two-byte code, [1] base character, [2] combiner. */
2376 for (i = 0; i < sizeof_x0213_combining_table; i++) {
2377 if (x0213_combining_table[i][1] == wc &&
2378 x0213_combining_table[i][2] == wc2) {
2379 c2 = x0213_combining_table[i][0] >> 8;
2380 c1 = x0213_combining_table[i][0] & 0x7f;
2385 }else if (wc2 < 0x10FFFF) {
2386 return NKF_ICONV_NOT_COMBINED;
2388 return NKF_ICONV_INVALID_CODE_RANGE;
2390 return NKF_ICONV_NOT_COMBINED;
/*
 * w_iconv_combine: decode two UTF-8 sequences of up to three bytes each
 * (c1..c3 and c4..c6) and hand the two code points to
 * unicode_iconv_combine, returning its result.  Any checks between the
 * decoding and the call are not visible in this excerpt.
 */
2394 w_iconv_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6)
2397 wc = nkf_utf8_to_unicode(c1, c2, c3, 0);
2398 wc2 = nkf_utf8_to_unicode(c4, c5, c6, 0);
2401 return unicode_iconv_combine(wc, wc2);
/*
 * "Need more input" status values, expressed as size_t casts of -1 / -2.
 * The expansions are not wrapped in parentheses.
 */
2404 #define NKF_ICONV_NEED_ONE_MORE_BYTE (size_t)-1
2405 #define NKF_ICONV_NEED_TWO_MORE_BYTES (size_t)-2
/*
 * nkf_iconv_utf_16: decode one UTF-16 unit, or a surrogate pair, from the
 * bytes c1..c4 according to input_endian and pass the resulting code point
 * to unicode_iconv with combining enabled.
 * When the first unit is a high surrogate (high byte 0xD8..0xDB) and the
 * second unit's high byte is not 0xDC..0xDF, returns
 * NKF_ICONV_NEED_TWO_MORE_BYTES.  From the lines visible here that return
 * does not distinguish "not yet supplied" from "invalid low surrogate".
 * The non-surrogate decoding is not visible in this excerpt.
 */
2407 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/* Big endian: c1 and c3 are the high bytes of the two units. */
2416 if (input_endian == ENDIAN_BIG) {
2417 if (0xD8 <= c1 && c1 <= 0xDB) {
2418 if (0xDC <= c3 && c3 <= 0xDF) {
2419 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2420 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* Otherwise c2 and c4 are the high bytes. */
2425 if (0xD8 <= c2 && c2 <= 0xDB) {
2426 if (0xDC <= c4 && c4 <= 0xDF) {
2427 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2428 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2434 return (*unicode_iconv)(wc, FALSE);
/*
 * nkf_iconv_utf_16_combine: decode two UTF-16 units from c1..c4 and try to
 * convert them as a base + combining pair via unicode_iconv_combine.
 * Returns NKF_ICONV_NOT_COMBINED early when the tested high byte is in the
 * high-surrogate range 0xD8..0xDB.
 * NOTE(review): the big-endian branch tests c3 while the other branch tests
 * c2; the lines that build wc and wc2 are not visible here, so confirm that
 * both branches inspect the intended unit.
 */
2438 nkf_iconv_utf_16_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2442 if (input_endian == ENDIAN_BIG) {
2443 if (0xD8 <= c3 && c3 <= 0xDB) {
2444 return NKF_ICONV_NOT_COMBINED;
2450 if (0xD8 <= c2 && c2 <= 0xDB) {
2451 return NKF_ICONV_NOT_COMBINED;
2458 return unicode_iconv_combine(wc, wc2);
/*
 * nkf_iconv_utf_16_nocombine: decode a single UTF-16 unit from (c1, c2)
 * according to input_endian and convert it with unicode_iconv, passing TRUE
 * for nocombine.  The assignments to wc are not visible in this excerpt.
 */
2462 nkf_iconv_utf_16_nocombine(nkf_char c1, nkf_char c2)
2465 if (input_endian == ENDIAN_BIG)
2469 return (*unicode_iconv)(wc, TRUE);
/*
 * w_iconv16 / w_iconv32: converter entries for UTF-16 and UTF-32 input.
 * They return the distinct constants 16 and 32, which keeps the two
 * functions distinguishable (see the original comments).  Any statements
 * before the return are not visible in this excerpt.
 */
2473 w_iconv16(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2476 return 16; /* different from w_iconv32 */
2480 w_iconv32(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2483 return 32; /* different from w_iconv16 */
/*
 * utf32_to_nkf_char: assemble a code point from four input bytes according
 * to input_endian.  Four byte orders are handled; each visible expression
 * combines only three of the four bytes (the remaining byte is the most
 * significant one for that order).  Returns NKF_ICONV_INVALID_CODE_RANGE
 * on the path shown last.
 * NOTE(review): the case labels and any check on the omitted byte are not
 * visible in this excerpt.
 */
2487 utf32_to_nkf_char(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2491 switch(input_endian){
2493 wc = c2 << 16 | c3 << 8 | c4;
2496 wc = c3 << 16 | c2 << 8 | c1;
2499 wc = c1 << 16 | c4 << 8 | c3;
2502 wc = c4 << 16 | c1 << 8 | c2;
2505 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * nkf_iconv_utf_32: decode one UTF-32 unit from c1..c4 and convert it with
 * unicode_iconv, combining check enabled.  Error handling between the two
 * calls is not visible in this excerpt.
 */
2511 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2520 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2524 return (*unicode_iconv)(wc, FALSE);
/*
 * nkf_iconv_utf_32_combine: decode two UTF-32 units (c1..c4 and c5..c8) and
 * try to convert them as a base + combining pair via
 * unicode_iconv_combine.  Error handling after each decode is not visible
 * in this excerpt.
 */
2528 nkf_iconv_utf_32_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6, nkf_char c7, nkf_char c8)
2532 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2535 wc2 = utf32_to_nkf_char(c5, c6, c7, c8);
2539 return unicode_iconv_combine(wc, wc2);
/*
 * nkf_iconv_utf_32_nocombine: decode one UTF-32 unit from c1..c4 and
 * convert it with unicode_iconv, passing TRUE for nocombine.
 */
2543 nkf_iconv_utf_32_nocombine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2547 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2548 return (*unicode_iconv)(wc, TRUE);
/*
 * output_ascii_escape_sequence(mode): when output_mode is neither ASCII nor
 * ISO_8859_1, emit an escape sequence ending in ascii_intro and set
 * output_mode to mode.  The bytes emitted before ascii_intro are not
 * visible in this excerpt.
 */
2552 #define output_ascii_escape_sequence(mode) do { \
2553 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2556 (*o_putc)(ascii_intro); \
2557 output_mode = mode; \
/*
 * output_escape_sequence: emit the escape sequence that switches the output
 * to the designated character set `mode`.  The early test on
 * output_mode == mode presumably makes the call a no-op when the mode is
 * already active -- TODO confirm, the guarded statement is not visible.
 * One branch ends its sequence with kanji_intro.
 */
2562 output_escape_sequence(int mode)
2564 if (output_mode == mode)
2572 case JIS_X_0201_1976_K:
2580 (*o_putc)(kanji_intro);
/*
 * j_oconv: output converter that emits the internal pair (c2, c1) with
 * escape sequences selecting the character set.
 */
2605 j_oconv(nkf_char c2, nkf_char c1)
2607 #ifdef NUMCHAR_OPTION
/*
 * A wrapped Unicode value is first converted with w16e_conv.  If it is
 * still unconverted, code points 0xE000..0xE757 are mapped (when
 * ms_ucs_map_f is set) onto rows starting at 0x7F, 94 cells per row;
 * otherwise encode_fallback is invoked when it is set.
 */
2608 if (c2 == 0 && nkf_char_unicode_p(c1)){
2609 w16e_conv(c1, &c2, &c1);
2610 if (c2 == 0 && nkf_char_unicode_p(c1)){
2611 c2 = c1 & VALUE_MASK;
2612 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2615 c2 = 0x7F + c1 / 94;
2616 c1 = 0x21 + c1 % 94;
2618 if (encode_fallback) (*encode_fallback)(c1);
2625 output_ascii_escape_sequence(ASCII);
/* EOF also returns the output to ASCII. */
2628 else if (c2 == EOF) {
2629 output_ascii_escape_sequence(ASCII);
2632 else if (c2 == ISO_8859_1) {
2633 output_ascii_escape_sequence(ISO_8859_1);
2636 else if (c2 == JIS_X_0201_1976_K) {
2637 output_escape_sequence(JIS_X_0201_1976_K);
2640 } else if (is_eucg3(c2)){
2641 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2642 (*o_putc)(c2 & 0x7f);
/*
 * Range check before two-byte output: out-of-range pairs are dropped
 * silently.  The first alternative allows rows up to 0x92; the condition
 * selecting it is not visible in this excerpt.
 */
2647 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2648 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2649 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
/*
 * e_oconv: output converter that emits the internal pair (c2, c1) with the
 * high bit set on each byte and tracks the result in output_mode.
 */
2656 e_oconv(nkf_char c2, nkf_char c1)
/*
 * A wrapped Unicode value is first converted with w16e_conv.  If it is
 * still unconverted and x0212_f is set, code points 0xE000..0xE757 are
 * mapped onto rows of 94 cells; otherwise encode_fallback is invoked when
 * it is set.
 */
2658 if (c2 == 0 && nkf_char_unicode_p(c1)){
2659 w16e_conv(c1, &c2, &c1);
2660 if (c2 == 0 && nkf_char_unicode_p(c1)){
2661 c2 = c1 & VALUE_MASK;
2662 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
/* The first ten rows start at 0x75; later rows carry the 0x8F prefix. */
2666 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2667 c1 = 0x21 + c1 % 94;
2670 (*o_putc)((c2 & 0x7f) | 0x080);
2671 (*o_putc)(c1 | 0x080);
2673 (*o_putc)((c2 & 0x7f) | 0x080);
2674 (*o_putc)(c1 | 0x080);
2678 if (encode_fallback) (*encode_fallback)(c1);
2686 } else if (c2 == 0) {
2687 output_mode = ASCII;
/* SS2 followed by the byte with its high bit set. */
2689 } else if (c2 == JIS_X_0201_1976_K) {
2690 output_mode = EUC_JP;
2691 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2692 } else if (c2 == ISO_8859_1) {
2693 output_mode = ISO_8859_1;
2694 (*o_putc)(c1 | 0x080);
2696 } else if (is_eucg3(c2)){
2697 output_mode = EUC_JP;
2698 #ifdef SHIFTJIS_CP932
/* Normalise the pair through e2s_conv/s2e_conv first. */
2701 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2702 s2e_conv(s2, s1, &c2, &c1);
2707 output_mode = ASCII;
2709 }else if (is_eucg3(c2)){
2712 (*o_putc)((c2 & 0x7f) | 0x080);
2713 (*o_putc)(c1 | 0x080);
2716 (*o_putc)((c2 & 0x7f) | 0x080);
2717 (*o_putc)(c1 | 0x080);
/* A non-graphic byte here calls set_iconv(FALSE, 0) and drops the pair. */
2721 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2722 set_iconv(FALSE, 0);
2723 return; /* too late to rescue this char */
2725 output_mode = EUC_JP;
2726 (*o_putc)(c2 | 0x080);
2727 (*o_putc)(c1 | 0x080);
/*
 * s_oconv: output converter that turns the internal pair (c2, c1) into
 * SHIFT_JIS bytes via e2s_conv and tracks the result in output_mode.
 */
2732 s_oconv(nkf_char c2, nkf_char c1)
2734 #ifdef NUMCHAR_OPTION
/*
 * A wrapped Unicode value is first converted with w16e_conv.  If it is
 * still unconverted and x0213_f is clear, code points 0xE000..0xE757 are
 * mapped onto lead bytes starting at 0xF0 (cp932inv_f set) or 0xEB, 188
 * cells each; otherwise encode_fallback is invoked when it is set.
 */
2735 if (c2 == 0 && nkf_char_unicode_p(c1)){
2736 w16e_conv(c1, &c2, &c1);
2737 if (c2 == 0 && nkf_char_unicode_p(c1)){
2738 c2 = c1 & VALUE_MASK;
2739 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2742 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
/* Trail bytes start at 0x40 and step over 0x7F. */
2744 c1 += 0x40 + (c1 > 0x3e);
2749 if(encode_fallback)(*encode_fallback)(c1);
2758 } else if (c2 == 0) {
2759 output_mode = ASCII;
2761 } else if (c2 == JIS_X_0201_1976_K) {
2762 output_mode = SHIFT_JIS;
2764 } else if (c2 == ISO_8859_1) {
2765 output_mode = ISO_8859_1;
2766 (*o_putc)(c1 | 0x080);
2768 } else if (is_eucg3(c2)){
2769 output_mode = SHIFT_JIS;
2770 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* A non-printable byte here calls set_iconv(FALSE, 0) and drops the pair. */
2776 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2777 set_iconv(FALSE, 0);
2778 return; /* too late to rescue this char */
2780 output_mode = SHIFT_JIS;
2781 e2s_conv(c2, c1, &c2, &c1);
2783 #ifdef SHIFTJIS_CP932
/* Optional remap of the produced code through the cp932inv table. */
2785 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2786 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2792 #endif /* SHIFTJIS_CP932 */
/* A non-zero prefix_table entry for the trail byte is emitted before it. */
2795 if (prefix_table[(unsigned char)c1]){
2796 (*o_putc)(prefix_table[(unsigned char)c1]);
2802 #ifdef UTF8_OUTPUT_ENABLE
/*
 * OUTPUT_UTF8(val): encode val with nkf_unicode_to_utf8 into the caller's
 * locals c1..c4 and write them with o_putc.  Bytes c2..c4 are written only
 * when non-zero.  The macro relies on c1, c2, c3 and c4 being declared in
 * the expanding scope.
 */
2803 #define OUTPUT_UTF8(val) do { \
2804 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); \
2806 if (c2) (*o_putc)(c2); \
2807 if (c3) (*o_putc)(c3); \
2808 if (c4) (*o_putc)(c4); \
/*
 * w_oconv: output converter for the internal pair (c2, c1).  A wrapped
 * Unicode value is unwrapped with VALUE_MASK; other pairs are mapped with
 * e2w_conv, and e2w_combining is consulted for a second code point.
 * output_bom_f is cleared in the visible BOM-handling line.
 * NOTE(review): the BOM bytes and the actual output calls are not visible
 * in this excerpt.
 */
2812 w_oconv(nkf_char c2, nkf_char c1)
2818 output_bom_f = FALSE;
2829 if (c2 == 0 && nkf_char_unicode_p(c1)){
2830 val = c1 & VALUE_MASK;
2838 val = e2w_conv(c2, c1);
2840 val2 = e2w_combining(val, c2, c1);
/*
 * OUTPUT_UTF16_BYTES(c1, c2): write the two bytes of one UTF-16 unit in the
 * order selected by output_endian.  The o_putc calls are not visible in
 * this excerpt; judging from its callers, c1 is the low byte and c2 the
 * high byte -- TODO confirm.
 */
2848 #define OUTPUT_UTF16_BYTES(c1, c2) do { \
2849 if (output_endian == ENDIAN_LITTLE){ \
/*
 * OUTPUT_UTF16(val): write val as UTF-16.  BMP values are written as one
 * unit; other values are masked with VALUE_MASK and, when within
 * UNICODE_MAX, written as a high/low surrogate pair.  The macro assigns to
 * val and uses the caller's locals c1 and c2 as scratch.
 */
2858 #define OUTPUT_UTF16(val) do { \
2859 if (nkf_char_unicode_bmp_p(val)) { \
2860 c2 = (val >> 8) & 0xff; \
2862 OUTPUT_UTF16_BYTES(c1, c2); \
2864 val &= VALUE_MASK; \
2865 if (val <= UNICODE_MAX) { \
2866 c2 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ \
2867 c1 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ \
2868 OUTPUT_UTF16_BYTES(c2 & 0xff, (c2 >> 8) & 0xff); \
2869 OUTPUT_UTF16_BYTES(c1 & 0xff, (c1 >> 8) & 0xff); \
/*
 * w_oconv16: UTF-16 output converter for the internal pair (c2, c1).
 * When a BOM is pending, output_bom_f is cleared and the bytes 0xFF/0xFE
 * are written through OUTPUT_UTF16_BYTES, so the BOM follows output_endian.
 * Other pairs are mapped with e2w_conv, and e2w_combining is consulted for
 * a second code point.  Most branches are not visible in this excerpt.
 */
2875 w_oconv16(nkf_char c2, nkf_char c1)
2878 output_bom_f = FALSE;
2879 OUTPUT_UTF16_BYTES(0xFF, 0xFE);
2887 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2891 val = e2w_conv(c2, c1);
2893 val2 = e2w_combining(val, c2, c1);
2898 OUTPUT_UTF16_BYTES(c1, c2);
/*
 * OUTPUT_UTF32(c): write c as one UTF-32 unit.  With ENDIAN_LITTLE the low
 * byte is written first; otherwise the bytes are written high to low.  The
 * line writing the most significant byte is not visible in this excerpt.
 * The argument c is evaluated several times.
 */
2902 #define OUTPUT_UTF32(c) do { \
2903 if (output_endian == ENDIAN_LITTLE){ \
2904 (*o_putc)( (c) & 0xFF); \
2905 (*o_putc)(((c) >> 8) & 0xFF); \
2906 (*o_putc)(((c) >> 16) & 0xFF); \
2910 (*o_putc)(((c) >> 16) & 0xFF); \
2911 (*o_putc)(((c) >> 8) & 0xFF); \
2912 (*o_putc)( (c) & 0xFF); \
/*
 * w_oconv32: UTF-32 output converter for the internal pair (c2, c1).
 * When a BOM is pending, output_bom_f is cleared and the BOM is written
 * according to output_endian.  ISO_8859_1 and wrapped Unicode values have
 * their own branches; other pairs are mapped with e2w_conv, and
 * e2w_combining is consulted for a second code point.  Most statements are
 * not visible in this excerpt.
 */
2917 w_oconv32(nkf_char c2, nkf_char c1)
2920 output_bom_f = FALSE;
2921 if (output_endian == ENDIAN_LITTLE){
2939 if (c2 == ISO_8859_1) {
2941 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2945 val = e2w_conv(c2, c1);
2947 val2 = e2w_combining(val, c2, c1);
/*
 * Penalty bits used when guessing the input encoding.  Each flag is the
 * previous one shifted left by one bit, so they can be OR-ed into a
 * per-candidate score (see set_code_score).  A numerically smaller score is
 * preferred when h_conv compares candidates.
 */
2956 #define SCORE_L2 (1) /* Kanji Level 2 */
2957 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2958 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2959 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2960 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2961 #define SCORE_X0213 (SCORE_X0212 << 1) /* JIS X 0213 */
2962 #define SCORE_NO_EXIST (SCORE_X0213 << 1) /* Undefined Characters */
2963 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2964 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
/* Starting score assigned by status_reset(). */
2966 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score tables consulted by code_score().  Each is indexed by the low
 * nibble of a row byte.  Only part of each initialiser is visible in this
 * excerpt.
 */
/* Used when (c2 & 0x70) == 0x20. */
2968 static const nkf_char score_table_A0[] = {
2971 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2972 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_X0213,
/* Used when (c2 & 0x70) == 0x70. */
2975 static const nkf_char score_table_F0[] = {
2976 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2977 SCORE_L2, SCORE_DEPEND, SCORE_X0213, SCORE_X0213,
2978 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2979 SCORE_CP932, SCORE_X0213, SCORE_X0213, SCORE_ERROR,
/* After a 0x8f prefix, used when (c1 & 0x70) == 0x20. */
2982 static const nkf_char score_table_8FA0[] = {
2983 0, SCORE_X0213, SCORE_X0212, SCORE_X0213,
2984 SCORE_X0213, SCORE_X0213, SCORE_X0212, SCORE_X0212,
2985 SCORE_X0213, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2986 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
/* After a 0x8f prefix, used when (c1 & 0x70) == 0x60. */
2989 static const nkf_char score_table_8FE0[] = {
2990 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2991 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2992 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2993 SCORE_X0212, SCORE_X0212, SCORE_X0213, SCORE_X0213,
/* After a 0x8f prefix, used when (c1 & 0x70) == 0x70. */
2996 static const nkf_char score_table_8FF0[] = {
2997 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0212,
2998 SCORE_X0212, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2999 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
3000 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
/*
 * set_code_score: OR the given penalty bits into ptr->score.  Any guard
 * around the update is not visible in this excerpt.
 */
3004 set_code_score(struct input_code *ptr, nkf_char score)
3007 ptr->score |= score;
/*
 * clr_code_score: clear the given penalty bits from ptr->score.  Any guard
 * around the update is not visible in this excerpt.
 */
3012 clr_code_score(struct input_code *ptr, nkf_char score)
3015 ptr->score &= ~score;
/*
 * code_score: add penalty bits to a candidate input code based on the
 * character currently held in ptr->buf (buf[0] = row c2, buf[1] = cell c1).
 * The first condition, which leads to SCORE_ERROR, is not visible in this
 * excerpt.
 */
3020 code_score(struct input_code *ptr)
3022 nkf_char c2 = ptr->buf[0];
3023 nkf_char c1 = ptr->buf[1];
3025 set_code_score(ptr, SCORE_ERROR);
3026 }else if (c2 == SS2){
3027 set_code_score(ptr, SCORE_KANA);
/* After a 0x8f prefix the score depends on the row in c1. */
3028 }else if (c2 == 0x8f){
3029 if ((c1 & 0x70) == 0x20){
3030 set_code_score(ptr, score_table_8FA0[c1 & 0x0f]);
3031 }else if ((c1 & 0x70) == 0x60){
3032 set_code_score(ptr, score_table_8FE0[c1 & 0x0f]);
3033 }else if ((c1 & 0x70) == 0x70){
3034 set_code_score(ptr, score_table_8FF0[c1 & 0x0f]);
3036 set_code_score(ptr, SCORE_X0212);
3038 #ifdef UTF8_OUTPUT_ENABLE
/* A pair for which e2w_conv returns 0 counts as an undefined character. */
3039 }else if (!e2w_conv(c2, c1)){
3040 set_code_score(ptr, SCORE_NO_EXIST);
3042 }else if ((c2 & 0x70) == 0x20){
3043 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
3044 }else if ((c2 & 0x70) == 0x70){
3045 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
3046 }else if ((c2 & 0x70) >= 0x50){
3047 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: take a candidate input code out of consideration.  If
 * the currently selected converter (iconv) belongs to this candidate, the
 * selection is cleared with set_iconv(FALSE, 0).  The updates to ptr
 * itself are not visible in this excerpt.
 */
3052 status_disable(struct input_code *ptr)
3057 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: append c to ptr->buf and advance ptr->index.  No bounds
 * check is visible here; callers must keep index within buf.
 */
3061 status_push_ch(struct input_code *ptr, nkf_char c)
3063 ptr->buf[ptr->index++] = c;
/* status_clear: body not visible in this excerpt. */
3067 status_clear(struct input_code *ptr)
/*
 * status_reset: put the candidate's score back to SCORE_INIT.  Other
 * statements are not visible in this excerpt.
 */
3074 status_reset(struct input_code *ptr)
3077 ptr->score = SCORE_INIT;
/*
 * status_reinit: clear the candidate's _file_stat field.  Other statements
 * are not visible in this excerpt.
 */
3081 status_reinit(struct input_code *ptr)
3084 ptr->_file_stat = 0;
/*
 * status_check: acts only for c <= DEL once an input code is established
 * (estab_f).  The action taken is not visible in this excerpt.
 */
3088 status_check(struct input_code *ptr, nkf_char c)
3090 if (c <= DEL && estab_f){
/*
 * s_status: per-byte state machine judging whether the input is plausible
 * for the two-byte encoding handled by s2e_conv.  Lead bytes are classified
 * and pushed into ptr->buf; once the trail byte arrives the pair is
 * converted in place with s2e_conv and scored.  An unacceptable byte
 * disables the candidate via status_disable.
 * NOTE(review): the state assignments (ptr->stat), case labels and the
 * scoring calls between the visible lines are not shown in this excerpt.
 */
3096 s_status(struct input_code *ptr, nkf_char c)
3100 status_check(ptr, c);
3105 }else if (nkf_char_unicode_p(c)){
/* Single bytes 0xa1..0xdf are recorded as SS2 followed by the byte. */
3107 }else if (0xa1 <= c && c <= 0xdf){
3108 status_push_ch(ptr, SS2);
3109 status_push_ch(ptr, c);
/* Lead-byte ranges, each pushed and then handled in a later state. */
3112 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
3114 status_push_ch(ptr, c);
3115 }else if (0xed <= c && c <= 0xee){
3117 status_push_ch(ptr, c);
3118 #ifdef SHIFTJIS_CP932
3119 }else if (is_ibmext_in_sjis(c)){
3121 status_push_ch(ptr, c);
3122 #endif /* SHIFTJIS_CP932 */
3124 }else if (0xf0 <= c && c <= 0xfc){
3126 status_push_ch(ptr, c);
3127 #endif /* X0212_ENABLE */
3129 status_disable(ptr);
/* Trail byte: 0x40..0x7e or 0x80..0xfc is accepted. */
3133 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3134 status_push_ch(ptr, c);
3135 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3139 status_disable(ptr);
3143 #ifdef SHIFTJIS_CP932
/* In this state the pair is scored SCORE_CP932 when s2e_conv succeeds. */
3144 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
3145 status_push_ch(ptr, c);
3146 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
3147 set_code_score(ptr, SCORE_CP932);
3152 #endif /* SHIFTJIS_CP932 */
3153 status_disable(ptr);
/* Here SCORE_CP932 is set without checking the s2e_conv result. */
3156 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3157 status_push_ch(ptr, c);
3158 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3159 set_code_score(ptr, SCORE_CP932);
3162 status_disable(ptr);
/*
 * e_status: per-byte state machine judging whether the input is plausible
 * for the encoding that uses SS2 / 0x8f prefixes and bytes 0xa1..0xfe.
 * Accepted bytes are pushed into ptr->buf; any other byte disables the
 * candidate via status_disable.
 * NOTE(review): the state assignments, case labels and scoring calls
 * between the visible lines are not shown in this excerpt.
 */
3169 e_status(struct input_code *ptr, nkf_char c)
3173 status_check(ptr, c);
3178 }else if (nkf_char_unicode_p(c)){
/* First byte: SS2 or 0xa1..0xfe starts a two-byte character. */
3180 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
3182 status_push_ch(ptr, c);
/* 0x8f starts a three-byte character. */
3184 }else if (0x8f == c){
3186 status_push_ch(ptr, c);
3187 #endif /* X0212_ENABLE */
3189 status_disable(ptr);
/* Following bytes must be in 0xa1..0xfe. */
3193 if (0xa1 <= c && c <= 0xfe){
3194 status_push_ch(ptr, c);
3198 status_disable(ptr);
3203 if (0xa1 <= c && c <= 0xfe){
3205 status_push_ch(ptr, c);
3207 status_disable(ptr);
3209 #endif /* X0212_ENABLE */
3213 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: per-byte state machine judging whether the input is plausible
 * UTF-8.  Lead bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 start a
 * sequence; continuation bytes must be in 0x80..0xbf; anything else
 * disables the candidate via status_disable.  When a sequence completes it
 * is converted in place with w2e_conv; a 0xef 0xbb 0xbf sequence is noted
 * in the local `bom`, whose use is not visible in this excerpt.
 * NOTE(review): the state assignments, case labels and scoring calls
 * between the visible lines are not shown in this excerpt.
 */
3215 w_status(struct input_code *ptr, nkf_char c)
3219 status_check(ptr, c);
3224 }else if (nkf_char_unicode_p(c)){
3226 }else if (0xc0 <= c && c <= 0xdf){
3228 status_push_ch(ptr, c);
3229 }else if (0xe0 <= c && c <= 0xef){
3231 status_push_ch(ptr, c);
3232 }else if (0xf0 <= c && c <= 0xf4){
3234 status_push_ch(ptr, c);
3236 status_disable(ptr);
3241 if (0x80 <= c && c <= 0xbf){
3242 status_push_ch(ptr, c);
/* index exceeding stat marks the sequence as complete here. */
3243 if (ptr->index > ptr->stat){
3244 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
3245 && ptr->buf[2] == 0xbf);
3246 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
3247 &ptr->buf[0], &ptr->buf[1]);
3254 status_disable(ptr);
/* In this state bytes are pushed only while index is still below stat. */
3258 if (0x80 <= c && c <= 0xbf){
3259 if (ptr->index < ptr->stat){
3260 status_push_ch(ptr, c);
3265 status_disable(ptr);
/*
 * code_status: feed one input byte c to the status_func of every entry in
 * input_code_list, then, if a single candidate was selected and no code is
 * established yet, install its converter with set_iconv(TRUE, ...).  For
 * c <= DEL a second pass over the list is made; its body is not visible in
 * this excerpt.
 * NOTE(review): how `result` and `action_flag` are updated inside the loop
 * is not visible here.
 */
3273 code_status(nkf_char c)
3275 int action_flag = 1;
3276 struct input_code *result = 0;
3277 struct input_code *p = input_code_list;
3279 if (!p->status_func) {
3283 if (!p->status_func)
3285 (p->status_func)(p, c);
3288 }else if(p->stat == 0){
3299 if (result && !estab_f){
3300 set_iconv(TRUE, result->iconv_func);
3301 }else if (c <= DEL){
3302 struct input_code *ptr = input_code_list;
/*
 * Fields of the nkf_state_t record (the struct header and some members,
 * such as nfc_buf used by nkf_state_init, are not visible in this excerpt).
 */
/* Pushback buffer read by the getc path and filled by std_ungetc(). */
3312 nkf_buf_t *std_gc_buf;
3313 nkf_char broken_state;
3314 nkf_buf_t *broken_buf;
3315 nkf_char mimeout_state;
/* Single shared state instance, set up by nkf_state_init(). */
3319 static nkf_state_t *nkf_state = NULL;
/* Capacity passed to nkf_buf_new() for std_gc_buf. */
3321 #define STD_GC_BUFSIZE (256)
/*
 * nkf_state_init: prepare the shared nkf_state record.  One path clears the
 * three existing buffers; the other allocates the record and creates the
 * buffers (std_gc_buf with STD_GC_BUFSIZE, broken_buf with 3, nfc_buf with
 * 9).  broken_state and mimeout_state are then set to 0.
 * NOTE(review): the condition choosing between the two paths is not
 * visible; presumably it tests whether nkf_state already exists -- confirm.
 */
3324 nkf_state_init(void)
3327 nkf_buf_clear(nkf_state->std_gc_buf);
3328 nkf_buf_clear(nkf_state->broken_buf);
3329 nkf_buf_clear(nkf_state->nfc_buf);
3332 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3333 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3334 nkf_state->broken_buf = nkf_buf_new(3);
3335 nkf_state->nfc_buf = nkf_buf_new(9);
3337 nkf_state->broken_state = 0;
3338 nkf_state->mimeout_state = 0;
/*
 * Pushed-back characters are served first: when std_gc_buf is not empty the
 * most recently pushed value is popped and returned.  (The header of the
 * enclosing function is not visible in this excerpt.)
 */
3345 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3346 return nkf_buf_pop(nkf_state->std_gc_buf);
/*
 * std_ungetc: push c onto nkf_state->std_gc_buf so that a later read
 * returns it.  The FILE argument is unused.
 */
3353 std_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3355 nkf_buf_push(nkf_state->std_gc_buf, c);
/* std_putc: body not visible in this excerpt. */
3361 std_putc(nkf_char c)
/* Look-ahead buffer filled by h_conv(); capacity is HOLD_SIZE*2 entries. */
3368 static nkf_char hold_buf[HOLD_SIZE*2];
3369 static int hold_count = 0;
/*
 * push_hold_buf: append c2 to hold_buf.  Returns EOF when this push filled
 * the buffer (the value is still stored), otherwise the new element count.
 * The action taken when the buffer is already full on entry is not visible
 * in this excerpt.
 */
3371 push_hold_buf(nkf_char c2)
3373 if (hold_count >= HOLD_SIZE*2)
3375 hold_buf[hold_count++] = c2;
3376 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3380 h_conv(FILE *f, nkf_char c1, nkf_char c2)
3387 /** it must NOT be in the kanji shift sequence */
3388 /** it must NOT be written in JIS7 */
3389 /** and it must be after 2 byte 8bit code */
3395 while ((c2 = (*i_getc)(f)) != EOF) {
3401 if (push_hold_buf(c2) == EOF || estab_f) {
3407 struct input_code *p = input_code_list;
3408 struct input_code *result = p;
3413 if (p->status_func && p->score < result->score) {
3418 set_iconv(TRUE, result->iconv_func);
3423 ** 1) EOF is detected, or
3424 ** 2) Code is established, or
3425 ** 3) Buffer is FULL (but last word is pushed)
3427 ** in 1) and 3) cases, we continue to use
3428 ** Kanji codes by oconv and leave estab_f unchanged.
3433 while (hold_index < hold_count){
3434 c1 = hold_buf[hold_index++];
3435 if (nkf_char_unicode_p(c1)) {
3439 else if (c1 <= DEL){
3442 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3443 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3447 if (hold_index < hold_count){
3448 c2 = hold_buf[hold_index++];
3459 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3462 if (hold_index < hold_count){
3463 c3 = hold_buf[hold_index++];
3464 } else if ((c3 = (*i_getc)(f)) == EOF) {
3469 if (hold_index < hold_count){
3470 c4 = hold_buf[hold_index++];
3471 } else if ((c4 = (*i_getc)(f)) == EOF) {
3476 (*iconv)(c1, c2, (c3<<8)|c4);
3479 /* 4 bytes UTF-8 (check combining character) */
3480 if (hold_index < hold_count){
3481 c3 = hold_buf[hold_index++];
3483 } else if ((c3 = (*i_getc)(f)) == EOF) {
3484 w_iconv_nocombine(c1, c2, 0);
3487 if (hold_index < hold_count){
3488 c4 = hold_buf[hold_index++];
3490 } else if ((c4 = (*i_getc)(f)) == EOF) {
3491 w_iconv_nocombine(c1, c2, 0);
3492 if (fromhold_count <= 2)
3498 if (w_iconv_combine(c1, c2, 0, c3, c4, 0)) {
3499 w_iconv_nocombine(c1, c2, 0);
3500 if (fromhold_count <= 2) {
3503 } else if (fromhold_count == 3) {
3512 /* 3 bytes EUC or UTF-8 */
3513 if (hold_index < hold_count){
3514 c3 = hold_buf[hold_index++];
3516 } else if ((c3 = (*i_getc)(f)) == EOF) {
3522 if ((*iconv)(c1, c2, c3) == -3) {
3523 /* 6 bytes UTF-8 (check combining character) */
3525 if (hold_index < hold_count){
3526 c4 = hold_buf[hold_index++];
3528 } else if ((c4 = (*i_getc)(f)) == EOF) {
3529 w_iconv_nocombine(c1, c2, c3);
3532 if (hold_index < hold_count){
3533 c5 = hold_buf[hold_index++];
3535 } else if ((c5 = (*i_getc)(f)) == EOF) {
3536 w_iconv_nocombine(c1, c2, c3);
3537 if (fromhold_count == 4)
3543 if (hold_index < hold_count){
3544 c6 = hold_buf[hold_index++];
3546 } else if ((c6 = (*i_getc)(f)) == EOF) {
3547 w_iconv_nocombine(c1, c2, c3);
3548 if (fromhold_count == 5) {
3550 } else if (fromhold_count == 4) {
3559 if (w_iconv_combine(c1, c2, c3, c4, c5, c6)) {
3560 w_iconv_nocombine(c1, c2, c3);
3561 if (fromhold_count == 6) {
3563 } else if (fromhold_count == 5) {
3566 } else if (fromhold_count == 4) {
3579 if (c3 == EOF) break;
3585 * Check and Ignore BOM
3591 input_bom_f = FALSE;
3592 switch(c2 = (*i_getc)(f)){
3594 if((c2 = (*i_getc)(f)) == 0x00){
3595 if((c2 = (*i_getc)(f)) == 0xFE){
3596 if((c2 = (*i_getc)(f)) == 0xFF){
3597 if(!input_encoding){
3598 set_iconv(TRUE, w_iconv32);
3600 if (iconv == w_iconv32) {
3602 input_endian = ENDIAN_BIG;
3605 (*i_ungetc)(0xFF,f);
3606 }else (*i_ungetc)(c2,f);
3607 (*i_ungetc)(0xFE,f);
3608 }else if(c2 == 0xFF){
3609 if((c2 = (*i_getc)(f)) == 0xFE){
3610 if(!input_encoding){
3611 set_iconv(TRUE, w_iconv32);
3613 if (iconv == w_iconv32) {
3614 input_endian = ENDIAN_2143;
3617 (*i_ungetc)(0xFF,f);
3618 }else (*i_ungetc)(c2,f);
3619 (*i_ungetc)(0xFF,f);
3620 }else (*i_ungetc)(c2,f);
3621 (*i_ungetc)(0x00,f);
3622 }else (*i_ungetc)(c2,f);
3623 (*i_ungetc)(0x00,f);
3626 if((c2 = (*i_getc)(f)) == 0xBB){
3627 if((c2 = (*i_getc)(f)) == 0xBF){
3628 if(!input_encoding){
3629 set_iconv(TRUE, w_iconv);
3631 if (iconv == w_iconv) {
3635 (*i_ungetc)(0xBF,f);
3636 }else (*i_ungetc)(c2,f);
3637 (*i_ungetc)(0xBB,f);
3638 }else (*i_ungetc)(c2,f);
3639 (*i_ungetc)(0xEF,f);
3642 if((c2 = (*i_getc)(f)) == 0xFF){
3643 if((c2 = (*i_getc)(f)) == 0x00){
3644 if((c2 = (*i_getc)(f)) == 0x00){
3645 if(!input_encoding){
3646 set_iconv(TRUE, w_iconv32);
3648 if (iconv == w_iconv32) {
3649 input_endian = ENDIAN_3412;
3652 (*i_ungetc)(0x00,f);
3653 }else (*i_ungetc)(c2,f);
3654 (*i_ungetc)(0x00,f);
3655 }else (*i_ungetc)(c2,f);
3656 if(!input_encoding){
3657 set_iconv(TRUE, w_iconv16);
3659 if (iconv == w_iconv16) {
3660 input_endian = ENDIAN_BIG;
3664 (*i_ungetc)(0xFF,f);
3665 }else (*i_ungetc)(c2,f);
3666 (*i_ungetc)(0xFE,f);
3669 if((c2 = (*i_getc)(f)) == 0xFE){
3670 if((c2 = (*i_getc)(f)) == 0x00){
3671 if((c2 = (*i_getc)(f)) == 0x00){
3672 if(!input_encoding){
3673 set_iconv(TRUE, w_iconv32);
3675 if (iconv == w_iconv32) {
3676 input_endian = ENDIAN_LITTLE;
3680 (*i_ungetc)(0x00,f);
3681 }else (*i_ungetc)(c2,f);
3682 (*i_ungetc)(0x00,f);
3683 }else (*i_ungetc)(c2,f);
3684 if(!input_encoding){
3685 set_iconv(TRUE, w_iconv16);
3687 if (iconv == w_iconv16) {
3688 input_endian = ENDIAN_LITTLE;
3692 (*i_ungetc)(0xFE,f);
3693 }else (*i_ungetc)(c2,f);
3694 (*i_ungetc)(0xFF,f);
3703 broken_getc(FILE *f)
3707 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3708 return nkf_buf_pop(nkf_state->broken_buf);
3711 if (c=='$' && nkf_state->broken_state != ESC
3712 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3714 nkf_state->broken_state = 0;
3715 if (c1=='@'|| c1=='B') {
3716 nkf_buf_push(nkf_state->broken_buf, c1);
3717 nkf_buf_push(nkf_state->broken_buf, c);
3723 } else if (c=='(' && nkf_state->broken_state != ESC
3724 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3726 nkf_state->broken_state = 0;
3727 if (c1=='J'|| c1=='B') {
3728 nkf_buf_push(nkf_state->broken_buf, c1);
3729 nkf_buf_push(nkf_state->broken_buf, c);
3736 nkf_state->broken_state = c;
3742 broken_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3744 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3745 nkf_buf_push(nkf_state->broken_buf, c);
3750 eol_conv(nkf_char c2, nkf_char c1)
3752 if (guess_f && input_eol != EOF) {
3753 if (c2 == 0 && c1 == LF) {
3754 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3755 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3756 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3758 else if (!input_eol) input_eol = CR;
3759 else if (input_eol != CR) input_eol = EOF;
3761 if (prev_cr || (c2 == 0 && c1 == LF)) {
3763 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3764 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3766 if (c2 == 0 && c1 == CR) prev_cr = CR;
3767 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3771 put_newline(void (*func)(nkf_char))
3773 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3788 oconv_newline(void (*func)(nkf_char, nkf_char))
3790 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3805 Return value of fold_conv()
3807 LF add newline and output char
3808 CR add newline and output nothing
3811 1 (or else) normal output
3813 fold state in prev (previous character)
3815 >0x80 Japanese (X0208/X0201)
3820 This fold algorithm does not preserve leading space in a line.
3821 This is the main difference from fmt.
3824 #define char_size(c2,c1) (c2?2:1)
3827 fold_conv(nkf_char c2, nkf_char c1)
3830 nkf_char fold_state;
3832 if (c1== CR && !fold_preserve_f) {
3833 fold_state=0; /* ignore cr */
3834 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3836 fold_state=0; /* ignore cr */
3837 } else if (c1== BS) {
3838 if (f_line>0) f_line--;
3840 } else if (c2==EOF && f_line != 0) { /* close open last line */
3842 } else if ((c1==LF && !fold_preserve_f)
3843 || ((c1==CR||(c1==LF&&f_prev!=CR))
3844 && fold_preserve_f)) {
3846 if (fold_preserve_f) {
3850 } else if ((f_prev == c1)
3852 ) { /* duplicate newline */
3855 fold_state = LF; /* output two newline */
3861 if (f_prev&0x80) { /* Japanese? */
3863 fold_state = 0; /* ignore given single newline */
3864 } else if (f_prev==SP) {
3868 if (++f_line<=fold_len)
3872 fold_state = CR; /* fold and output nothing */
3876 } else if (c1=='\f') {
3879 fold_state = LF; /* output newline and clear */
3880 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3881 /* X0208 kankaku or ascii space */
3883 fold_state = 0; /* remove duplicate spaces */
3886 if (++f_line<=fold_len)
3887 fold_state = SP; /* output ASCII space only */
3889 f_prev = SP; f_line = 0;
3890 fold_state = CR; /* fold and output nothing */
3894 prev0 = f_prev; /* we still need this one... , but almost done */
3896 if (c2 || c2 == JIS_X_0201_1976_K)
3897 f_prev |= 0x80; /* this is Japanese */
3898 f_line += c2 == JIS_X_0201_1976_K ? 1: char_size(c2,c1);
3899 if (f_line<=fold_len) { /* normal case */
3902 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3903 f_line = char_size(c2,c1);
3904 fold_state = LF; /* We can't wait, do fold now */
3905 } else if (c2 == JIS_X_0201_1976_K) {
3906 /* simple kinsoku rules return 1 means no folding */
3907 if (c1==(0xde&0x7f)) fold_state = 1; /* halfwidth voiced sound mark (dakuten) */
3908 else if (c1==(0xdf&0x7f)) fold_state = 1; /* halfwidth semi-voiced sound mark (handakuten) */
3909 else if (c1==(0xa4&0x7f)) fold_state = 1; /* 0xA4 halfwidth ideographic comma; the old note showed a full stop glyph */
3910 else if (c1==(0xa3&0x7f)) fold_state = 1; /* 0xA3 halfwidth right corner bracket; the old note showed a comma glyph */
3911 else if (c1==(0xa1&0x7f)) fold_state = 1; /* 0xA1 halfwidth ideographic full stop; the old note showed a right corner bracket glyph */
3912 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - (halfwidth prolonged sound mark) */
3913 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3915 fold_state = LF;/* add one new f_line before this character */
3918 fold_state = LF;/* add one new f_line before this character */
3921 /* kinsoku point in ASCII */
3922 if ( c1==')'|| /* { [ ( */
3933 /* just after special */
3934 } else if (!is_alnum(prev0)) {
3935 f_line = char_size(c2,c1);
3937 } else if ((prev0==SP) || /* ignored new f_line */
3938 (prev0==LF)|| /* ignored new f_line */
3939 (prev0&0x80)) { /* X0208 - ASCII */
3940 f_line = char_size(c2,c1);
3941 fold_state = LF;/* add one new f_line before this character */
3943 fold_state = 1; /* default no fold in ASCII */
3947 if (c1=='"') fold_state = 1; /* ideographic comma */
3948 else if (c1=='#') fold_state = 1; /* ideographic full stop */
3949 else if (c1=='W') fold_state = 1; /* right corner bracket */
3950 else if (c1=='K') fold_state = 1; /* fullwidth right parenthesis */
3951 else if (c1=='$') fold_state = 1; /* fullwidth comma */
3952 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
3953 else if (c1=='\'') fold_state = 1; /* NOTE(review): the old note here is garbled; presumably a fullwidth colon -- confirm */
3954 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
3955 else if (c1==')') fold_state = 1; /* fullwidth question mark */
3956 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
3957 else if (c1=='+') fold_state = 1; /* voiced sound mark (dakuten) */
3958 else if (c1==',') fold_state = 1; /* semi-voiced sound mark (handakuten) */
3959 /* default no fold in kinsoku */
3962 f_line = char_size(c2,c1);
3963 /* add one new f_line before this character */
3966 f_line = char_size(c2,c1);
3968 /* add one new f_line before this character */
3973 /* terminator process */
3974 switch(fold_state) {
3976 oconv_newline(o_fconv);
3982 oconv_newline(o_fconv);
3993 static nkf_char z_prev2=0,z_prev1=0;
3996 z_conv(nkf_char c2, nkf_char c1)
3999 /* if (c2) c1 &= 0x7f; assertion */
4001 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4007 if (z_prev2 == JIS_X_0201_1976_K) {
4008 if (c2 == JIS_X_0201_1976_K) {
4009 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
4011 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4013 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
4015 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4017 } else if (x0213_f && c1 == (0xdf&0x7f) && ev_x0213[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
4019 (*o_zconv)(ev_x0213[(z_prev1-SP)*2], ev_x0213[(z_prev1-SP)*2+1]);
4024 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4026 if (c2 == JIS_X_0201_1976_K) {
4027 if (dv[(c1-SP)*2] || ev[(c1-SP)*2] || (x0213_f && ev_x0213[(c1-SP)*2])) {
4028 /* wait for dakuten or handakuten */
4033 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4044 if (alpha_f&1 && c2 == 0x23) {
4045 /* JISX0208 Alphabet */
4047 } else if (c2 == 0x21) {
4048 /* JISX0208 Kigou */
4053 } else if (alpha_f&4) {
4058 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4064 if (alpha_f&8 && c2 == 0) {
4066 const char *entity = 0;
4068 case '>': entity = "&gt;"; break; /* HTML character reference emitted in place of '>' */
4069 case '<': entity = "&lt;"; break; /* ... in place of '<' */
4070 case '\"': entity = "&quot;"; break; /* ... in place of '"' */
4071 case '&': entity = "&amp;"; break; /* ... in place of '&' */
4074 while (*entity) (*o_zconv)(0, *entity++);
4080 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4085 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4089 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4093 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4097 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4101 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4105 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4109 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4113 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4118 (*o_zconv)(JIS_X_0201_1976_K, c);
4121 } else if (c2 == 0x25) {
4122 /* JISX0208 Katakana */
4123 static const int fullwidth_to_halfwidth[] =
4125 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4126 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4127 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4128 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4129 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4130 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4131 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4132 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4133 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4134 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4135 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x365F,
4136 0x375F, 0x385F, 0x395F, 0x3A5F, 0x3E5F, 0x425F, 0x445F, 0x0000
4138 if (fullwidth_to_halfwidth[c1-0x20]){
4139 c2 = fullwidth_to_halfwidth[c1-0x20];
4140 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
4142 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
4146 } else if (c2 == 0 && nkf_char_unicode_p(c1) &&
4147 ((c1&VALUE_MASK) == 0x3099 || (c1&VALUE_MASK) == 0x309A)) { /* combining dakuten / handakuten */
4148 (*o_zconv)(JIS_X_0201_1976_K, 0x5E + (c1&VALUE_MASK) - 0x3099);
4156 #define rot13(c) ( \
4158 (c <= 'M') ? (c + 13): \
4159 (c <= 'Z') ? (c - 13): \
4161 (c <= 'm') ? (c + 13): \
4162 (c <= 'z') ? (c - 13): \
4166 #define rot47(c) ( \
4168 ( c <= 'O') ? (c + 47) : \
4169 ( c <= '~') ? (c - 47) : \
4174 rot_conv(nkf_char c2, nkf_char c1)
4176 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
4182 (*o_rot_conv)(c2,c1);
4186 hira_conv(nkf_char c2, nkf_char c1)
4190 if (0x20 < c1 && c1 < 0x74) {
4192 (*o_hira_conv)(c2,c1);
4194 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
4196 c1 = nkf_char_unicode_new(0x3094);
4197 (*o_hira_conv)(c2,c1);
4200 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4202 (*o_hira_conv)(c2,c1);
4207 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
4210 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4212 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4216 (*o_hira_conv)(c2,c1);
4221 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4223 #define RANGE_NUM_MAX 18
4224 static const nkf_char range[RANGE_NUM_MAX][2] = {
4245 nkf_char start, end, c;
4247 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4251 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4256 for (i = 0; i < RANGE_NUM_MAX; i++) {
4257 start = range[i][0];
4260 if (c >= start && c <= end) {
4265 (*o_iso2022jp_check_conv)(c2,c1);
4269 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4271 static const unsigned char *mime_pattern[] = {
4272 (const unsigned char *)"\075?EUC-JP?B?",
4273 (const unsigned char *)"\075?SHIFT_JIS?B?",
4274 (const unsigned char *)"\075?ISO-8859-1?Q?",
4275 (const unsigned char *)"\075?ISO-8859-1?B?",
4276 (const unsigned char *)"\075?ISO-2022-JP?B?",
4277 (const unsigned char *)"\075?ISO-2022-JP?B?",
4278 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4279 #if defined(UTF8_INPUT_ENABLE)
4280 (const unsigned char *)"\075?UTF-8?B?",
4281 (const unsigned char *)"\075?UTF-8?Q?",
4283 (const unsigned char *)"\075?US-ASCII?Q?",
4288 /* Marker for raising the priority of the corresponding code */
4289 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4290 e_iconv, s_iconv, 0, 0, 0, 0, 0,
4291 #if defined(UTF8_INPUT_ENABLE)
4297 static const nkf_char mime_encode[] = {
4298 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K,
4299 #if defined(UTF8_INPUT_ENABLE)
4306 static const nkf_char mime_encode_method[] = {
4307 'B', 'B','Q', 'B', 'B', 'B', 'Q',
4308 #if defined(UTF8_INPUT_ENABLE)
4316 /* MIME preprocessor fifo */
4318 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
4319 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
4320 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
4322 unsigned char buf[MIME_BUF_SIZE];
4324 unsigned int last; /* decoded */
4325 unsigned int input; /* undecoded */
4327 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
4329 #define MAXRECOVER 20
4332 mime_input_buf_unshift(nkf_char c)
4334 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
4338 mime_ungetc(nkf_char c, ARG_UNUSED FILE *f)
4340 mime_input_buf_unshift(c);
4345 mime_ungetc_buf(nkf_char c, FILE *f)
4348 (*i_mungetc_buf)(c,f);
4350 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
4355 mime_getc_buf(FILE *f)
4357 /* we don't keep eof of mime_input_buf, because it contains ?= as
4358 a terminator. It was checked in mime_integrity. */
4359 return ((mimebuf_f)?
4360 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
4364 switch_mime_getc(void)
4366 if (i_getc!=mime_getc) {
4367 i_mgetc = i_getc; i_getc = mime_getc;
4368 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4369 if(mime_f==STRICT_MIME) {
4370 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4371 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4377 unswitch_mime_getc(void)
4379 if(mime_f==STRICT_MIME) {
4380 i_mgetc = i_mgetc_buf;
4381 i_mungetc = i_mungetc_buf;
4384 i_ungetc = i_mungetc;
4385 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4386 mime_iconv_back = NULL;
4390 mime_integrity(FILE *f, const unsigned char *p)
4394 /* In buffered mode, read until =? or NL or buffer full
4396 mime_input_state.input = mime_input_state.top;
4397 mime_input_state.last = mime_input_state.top;
4399 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
4401 q = mime_input_state.input;
4402 while((c=(*i_getc)(f))!=EOF) {
4403 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
4404 break; /* buffer full */
4406 if (c=='=' && d=='?') {
4407 /* checked. skip header, start decode */
4408 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4409 /* mime_last_input = mime_input_state.input; */
4410 mime_input_state.input = q;
4414 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4416 /* Should we check length mod 4? */
4417 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4420 /* In case of Incomplete MIME, no MIME decode */
4421 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4422 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
4423 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
4424 switch_mime_getc(); /* anyway we need buffered getc */
4429 mime_begin_strict(FILE *f)
4433 const unsigned char *p,*q;
4434 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4436 mime_decode_mode = FALSE;
4437 /* =? has been checked */
4439 p = mime_pattern[j];
4442 for(i=2;p[i]>SP;i++) { /* start at =? */
4443 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4444 /* pattern fails, try next one */
4446 while (mime_pattern[++j]) {
4447 p = mime_pattern[j];
4448 for(k=2;k<i;k++) /* assume length(p) > i */
4449 if (p[k]!=q[k]) break;
4450 if (k==i && nkf_toupper(c1)==p[k]) break;
4452 p = mime_pattern[j];
4453 if (p) continue; /* found next one, continue */
4454 /* all fails, output from recovery buffer */
4462 mime_decode_mode = p[i-2];
4464 mime_iconv_back = iconv;
4465 set_iconv(FALSE, mime_priority_func[j]);
4466 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4468 if (mime_decode_mode=='B') {
4469 mimebuf_f = unbuf_f;
4471 /* do MIME integrity check */
4472 return mime_integrity(f,mime_pattern[j]);
4486 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4487 /* re-read and convert again from mime_buffer. */
4489 /* =? has been checked */
4490 k = mime_input_state.last;
4491 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4492 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4493 /* We accept any character type even if it is broken up by new lines */
4494 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4495 if (c1==LF||c1==SP||c1==CR||
4496 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4498 /* Failed. But this could be another MIME preamble */
4500 mime_input_state.last--;
4506 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4507 if (!(++i<MAXRECOVER) || c1==EOF) break;
4508 if (c1=='b'||c1=='B') {
4509 mime_decode_mode = 'B';
4510 } else if (c1=='q'||c1=='Q') {
4511 mime_decode_mode = 'Q';
4515 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4516 if (!(++i<MAXRECOVER) || c1==EOF) break;
4518 mime_decode_mode = FALSE;
4524 if (!mime_decode_mode) {
4525 /* false MIME preamble, restart from mime_buffer */
4526 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4527 /* Since we are in MIME mode until buffer becomes empty, */
4528 /* we never go into mime_begin again for a while. */
4531 /* discard mime preamble, and goto MIME mode */
4532 mime_input_state.last = k;
4533 /* do no MIME integrity check */
4534 return c1; /* used only for checking EOF */
4539 no_putc(ARG_UNUSED nkf_char c)
4545 debug(const char *str)
4548 fprintf(stderr, "%s\n", str ? str : "NULL");
4554 set_input_codename(const char *codename)
4556 if (!input_codename) {
4557 input_codename = codename;
4558 } else if (strcmp(codename, input_codename) != 0) {
4559 input_codename = "";
4564 get_guessed_code(void)
4566 if (input_codename && !*input_codename) {
4567 input_codename = "BINARY";
4569 struct input_code *p = find_inputcode_byfunc(iconv);
4570 if (!input_codename) {
4571 input_codename = "ASCII";
4572 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4573 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4574 input_codename = "CP932";
4575 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4576 if (p->score & SCORE_X0213)
4577 input_codename = "EUC-JIS-2004";
4578 else if (p->score & (SCORE_X0212))
4579 input_codename = "EUCJP-MS";
4580 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4581 input_codename = "CP51932";
4582 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4583 if (p->score & (SCORE_KANA))
4584 input_codename = "CP50221";
4585 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4586 input_codename = "CP50220";
4589 return input_codename;
4592 #if !defined(PERL_XS) && !defined(WIN32DLL)
4594 print_guessed_code(char *filename)
4596 if (filename != NULL) printf("%s: ", filename);
4597 if (input_codename && !*input_codename) {
4600 input_codename = get_guessed_code();
4602 printf("%s\n", input_codename);
4604 printf("%s%s%s%s\n",
4606 iconv != w_iconv16 && iconv != w_iconv32 ? "" :
4607 input_endian == ENDIAN_LITTLE ? " LE" :
4608 input_endian == ENDIAN_BIG ? " BE" :
4610 input_bom_f ? " (BOM)" : "",
4611 input_eol == CR ? " (CR)" :
4612 input_eol == LF ? " (LF)" :
4613 input_eol == CRLF ? " (CRLF)" :
4614 input_eol == EOF ? " (MIXED NL)" :
4624 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4626 nkf_char c1, c2, c3;
4632 if (!nkf_isxdigit(c2)){
4637 if (!nkf_isxdigit(c3)){
4642 return (hex2bin(c2) << 4) | hex2bin(c3);
4648 return hex_getc(':', f, i_cgetc, i_cungetc);
4652 cap_ungetc(nkf_char c, FILE *f)
4654 return (*i_cungetc)(c, f);
4660 return hex_getc('%', f, i_ugetc, i_uungetc);
4664 url_ungetc(nkf_char c, FILE *f)
4666 return (*i_uungetc)(c, f);
4670 #ifdef NUMCHAR_OPTION
4672 numchar_getc(FILE *f)
4674 nkf_char (*g)(FILE *) = i_ngetc;
4675 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4686 if (buf[i] == 'x' || buf[i] == 'X'){
4687 for (j = 0; j < 7; j++){
4689 if (!nkf_isxdigit(buf[i])){
4696 c |= hex2bin(buf[i]);
4699 for (j = 0; j < 8; j++){
4703 if (!nkf_isdigit(buf[i])){
4710 c += hex2bin(buf[i]);
4716 return nkf_char_unicode_new(c);
4726 numchar_ungetc(nkf_char c, FILE *f)
4728 return (*i_nungetc)(c, f);
4732 #ifdef UNICODE_NORMALIZATION
4737 nkf_char (*g)(FILE *f) = i_nfc_getc;
4738 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4739 nkf_buf_t *buf = nkf_state->nfc_buf;
4740 const unsigned char *array;
4741 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4742 nkf_char c = (*g)(f);
4744 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4746 nkf_buf_push(buf, c);
4748 while (lower <= upper) {
4749 int mid = (lower+upper) / 2;
4751 array = normalization_table[mid].nfd;
4752 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4753 if (len >= nkf_buf_length(buf)) {
4757 lower = 1, upper = 0;
4760 nkf_buf_push(buf, c);
4762 if (array[len] != nkf_buf_at(buf, len)) {
4763 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4764 else upper = mid - 1;
4771 array = normalization_table[mid].nfc;
4773 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4774 nkf_buf_push(buf, array[i]);
4778 } while (lower <= upper);
4780 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4781 c = nkf_buf_pop(buf);
4787 nfc_ungetc(nkf_char c, FILE *f)
4789 return (*i_nfc_ungetc)(c, f);
4791 #endif /* UNICODE_NORMALIZATION */
4795 base64decode(nkf_char c)
4800 i = c - 'A'; /* A..Z 0-25 */
4801 } else if (c == '_') {
4802 i = '?' /* 63 */ ; /* _ 63 */
4804 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4806 } else if (c > '/') {
4807 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4808 } else if (c == '+' || c == '-') {
4809 i = '>' /* 62 */ ; /* + and - 62 */
4811 i = '?' /* 63 */ ; /* / 63 */
4819 nkf_char c1, c2, c3, c4, cc;
4820 nkf_char t1, t2, t3, t4, mode, exit_mode;
4821 nkf_char lwsp_count;
4824 nkf_char lwsp_size = 128;
4826 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4827 return mime_input_buf(mime_input_state.top++);
4829 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4830 mime_decode_mode=FALSE;
4831 unswitch_mime_getc();
4832 return (*i_getc)(f);
4835 if (mimebuf_f == FIXED_MIME)
4836 exit_mode = mime_decode_mode;
4839 if (mime_decode_mode == 'Q') {
4840 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4842 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4843 if (c1<=SP || DEL<=c1) {
4844 mime_decode_mode = exit_mode; /* prepare for quit */
4847 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4851 mime_decode_mode = exit_mode; /* prepare for quit */
4852 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4853 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4854 /* end Q encoding */
4855 input_mode = exit_mode;
4857 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4858 while ((c1=(*i_getc)(f))!=EOF) {
4863 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4871 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4872 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4887 lwsp_buf[lwsp_count] = (unsigned char)c1;
4888 if (lwsp_count++>lwsp_size){
4890 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4891 lwsp_buf = lwsp_buf_new;
4897 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4899 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4900 i_ungetc(lwsp_buf[lwsp_count],f);
4903 nkf_xfree(lwsp_buf);
4906 if (c1=='='&&c2<SP) { /* this is soft wrap */
4907 while((c1 = (*i_mgetc)(f)) <=SP) {
4908 if (c1 == EOF) return (EOF);
4910 mime_decode_mode = 'Q'; /* still in MIME */
4911 goto restart_mime_q;
4914 mime_decode_mode = 'Q'; /* still in MIME */
4918 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4919 if (c2<=SP) return c2;
4920 mime_decode_mode = 'Q'; /* still in MIME */
4921 return ((hex2bin(c2)<<4) + hex2bin(c3));
4924 if (mime_decode_mode != 'B') {
4925 mime_decode_mode = FALSE;
4926 return (*i_mgetc)(f);
4930 /* Base64 encoding */
4932 MIME allows line breaks in the middle of
4933 Base64, but we are very pessimistic about decoding
4934 in unbuf mode because MIME encoded text may be broken by
4935 less or an editor's control sequence (such as ESC-[-K) in unbuffered
4936 mode. Ignore incomplete MIME.
4938 mode = mime_decode_mode;
4939 mime_decode_mode = exit_mode; /* prepare for quit */
4941 while ((c1 = (*i_mgetc)(f))<=SP) {
4946 if ((c2 = (*i_mgetc)(f))<=SP) {
4949 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4950 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4953 if ((c1 == '?') && (c2 == '=')) {
4956 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4957 while ((c1=(*i_getc)(f))!=EOF) {
4962 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4970 if ((c1=(*i_getc)(f))!=EOF) {
4974 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4989 lwsp_buf[lwsp_count] = (unsigned char)c1;
4990 if (lwsp_count++>lwsp_size){
4992 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4993 lwsp_buf = lwsp_buf_new;
4999 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5001 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5002 i_ungetc(lwsp_buf[lwsp_count],f);
5005 nkf_xfree(lwsp_buf);
5009 if ((c3 = (*i_mgetc)(f))<=SP) {
5012 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5013 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5017 if ((c4 = (*i_mgetc)(f))<=SP) {
5020 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5021 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5025 mime_decode_mode = mode; /* still in MIME sigh... */
5027 /* BASE 64 decoding */
5029 t1 = 0x3f & base64decode(c1);
5030 t2 = 0x3f & base64decode(c2);
5031 t3 = 0x3f & base64decode(c3);
5032 t4 = 0x3f & base64decode(c4);
5033 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5035 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5036 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5038 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5039 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5041 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5046 return mime_input_buf(mime_input_state.top++);
5049 static const char basis_64[] =
5050 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
5052 #define MIMEOUT_BUF_LENGTH 74
5054 unsigned char buf[MIMEOUT_BUF_LENGTH+1];
5058 /*nkf_char mime_lastchar2, mime_lastchar1;*/
5061 open_mime(nkf_char mode)
5063 const unsigned char *p;
5066 p = mime_pattern[0];
5067 for(i=0;mime_pattern[i];i++) {
5068 if (mode == mime_encode[i]) {
5069 p = mime_pattern[i];
5073 mimeout_mode = mime_encode_method[i];
5075 if (base64_count>45) {
5076 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
5077 (*o_mputc)(mimeout_state.buf[i]);
5080 put_newline(o_mputc);
5083 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
5087 for (;i<mimeout_state.count;i++) {
5088 if (nkf_isspace(mimeout_state.buf[i])) {
5089 (*o_mputc)(mimeout_state.buf[i]);
5099 j = mimeout_state.count;
5100 mimeout_state.count = 0;
5102 mime_putc(mimeout_state.buf[i]);
/*
 * mime_prechar: line-length housekeeping done before a character pair
 * (c2, c1) is forwarded by base64_conv.
 * The width estimate is base64_count + mimeout_state.count/3*4. When it
 * exceeds the threshold of the current branch (73, 66 or 60) the downstream
 * converter is flushed with (EOF,0), a newline is produced through
 * oconv_newline and a single SP is sent as the continuation prefix.
 * NOTE(review): some lines of this function are not visible in this
 * listing, so the exact branch structure should be confirmed.
 */
5107 mime_prechar(nkf_char c2, nkf_char c1)
5109 if (mimeout_mode > 0){
5111 if (base64_count + mimeout_state.count/3*4> 73){
5112 (*o_base64conv)(EOF,0);
5113 oconv_newline(o_base64conv);
5114 (*o_base64conv)(0,SP);
/* Tighter limit (66) applied when the character is not plain 7-bit. */
5118 if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) {
5119 (*o_base64conv)(EOF,0);
5120 oconv_newline(o_base64conv);
5121 (*o_base64conv)(0,SP);
/* Limit 60: choose Q for ASCII or ISO-8859-1 output and B otherwise, open
 * the encoded word, then break the line as above. */
5127 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
5128 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5129 open_mime(output_mode);
5130 (*o_base64conv)(EOF,0);
5131 oconv_newline(o_base64conv);
5132 (*o_base64conv)(0,SP);
5151 switch(mimeout_mode) {
5156 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
5162 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
5167 if (mimeout_mode > 0) {
5168 if (mimeout_f!=FIXED_MIME) {
5170 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte c according to mimeout_mode and write the
 * result through o_mputc.
 * The visible statements show a hex escape of non-alphanumeric bytes and a
 * three-step base64 encoder that keeps the previous byte in
 * nkf_state->mimeout_state.
 * NOTE(review): the case labels of the switch are not visible in this
 * listing; which mimeout_mode value selects which step must be confirmed.
 */
5176 mimeout_addchar(nkf_char c)
5178 switch(mimeout_mode) {
/* Non-alphanumeric byte: emitted as two hex digits (high nibble first). */
5183 } else if(!nkf_isalnum(c)) {
5185 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5186 (*o_mputc)(bin2hex((c&0xf)));
/* Base64 step with no carried bits: emit the top six bits of c and keep c. */
5194 nkf_state->mimeout_state=c;
5195 (*o_mputc)(basis_64[c>>2]);
/* Step with two carried bits: low two bits of the kept byte + high nibble of c. */
5200 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5201 nkf_state->mimeout_state=c;
/* Step with four carried bits: low nibble of the kept byte + top two bits
 * of c, followed by the low six bits of c. */
5206 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
5207 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_putc: MIME output stage for one character c (or EOF).
 * Visible behaviour: with mimeout_f == FIXED_MIME the character is line
 * wrapped and encoded directly; otherwise characters are either written
 * through o_mputc, encoded through mimeout_addchar, or collected in
 * mimeout_state.buf until it is decided whether an encoded word has to be
 * opened with open_mime.
 * NOTE(review): many lines of this function are not visible in this
 * listing; the comments below describe the visible statements only.
 */
5219 mime_putc(nkf_char c)
/* Fixed encoding: break the line once base64_count exceeds 71. */
5224 if (mimeout_f == FIXED_MIME){
5225 if (mimeout_mode == 'Q'){
5226 if (base64_count > 71){
5227 if (c!=CR && c!=LF) {
5229 put_newline(o_mputc);
5234 if (base64_count > 71){
5236 put_newline(o_mputc);
5239 if (c == EOF) { /* c==EOF */
5243 if (c != EOF) { /* c!=EOF */
5249 /* mimeout_f != FIXED_MIME */
/* End of input: drain whatever is still pending in mimeout_state.buf. */
5251 if (c == EOF) { /* c==EOF */
5252 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
5253 j = mimeout_state.count;
5254 mimeout_state.count = 0;
5256 if (mimeout_mode > 0) {
5257 if (!nkf_isblank(mimeout_state.buf[j-1])) {
5259 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
5262 mimeout_addchar(mimeout_state.buf[i]);
5266 mimeout_addchar(mimeout_state.buf[i]);
5270 mimeout_addchar(mimeout_state.buf[i]);
5276 mimeout_addchar(mimeout_state.buf[i]);
/* lastchar is the most recently buffered byte, when there is one. */
5282 if (mimeout_state.count > 0){
5283 lastchar = mimeout_state.buf[mimeout_state.count - 1];
/* Q encoding in progress. */
5288 if (mimeout_mode=='Q') {
5289 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5290 if (c == CR || c == LF) {
5295 } else if (c <= SP) {
5297 if (base64_count > 70) {
5298 put_newline(o_mputc);
5301 if (!nkf_isblank(c)) {
5306 if (base64_count > 70) {
5308 put_newline(o_mputc);
5311 open_mime(output_mode);
5313 if (!nkf_noescape_mime(c)) {
/* No encoded word open (mimeout_mode is 0 or negative). */
5326 if (mimeout_mode <= 0) {
5327 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5328 output_mode == UTF_8)) {
5329 if (nkf_isspace(c)) {
5331 if (mimeout_mode == -1) {
5334 if (c==CR || c==LF) {
5336 open_mime(output_mode);
/* Write the pending bytes unencoded and restart the buffer with c. */
5342 for (i=0;i<mimeout_state.count;i++) {
5343 (*o_mputc)(mimeout_state.buf[i]);
5344 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
5355 mimeout_state.buf[0] = (char)c;
5356 mimeout_state.count = 1;
/* The pending text would push the line past 76: look for the boundary=
 * parameter inside the pending bytes and fold the line around it. */
5358 if (base64_count > 1
5359 && base64_count + mimeout_state.count > 76
5360 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
5361 static const char *str = "boundary=\"";
5362 static int len = 10;
5365 for (; i < mimeout_state.count - len; ++i) {
5366 if (!strncmp((char *)(mimeout_state.buf+i), str, len)) {
5372 if (i == 0 || i == mimeout_state.count - len) {
5373 put_newline(o_mputc);
5375 if (!nkf_isspace(mimeout_state.buf[0])){
5382 for (j = 0; j <= i; ++j) {
5383 (*o_mputc)(mimeout_state.buf[j]);
5385 put_newline(o_mputc);
/* NOTE(review): this shift loop runs up to and including index
 * mimeout_state.count, i.e. it also copies the slot one past the last
 * pending byte - confirm that this is intended. */
5387 for (; j <= mimeout_state.count; ++j) {
5388 mimeout_state.buf[j - i] = mimeout_state.buf[j];
5390 mimeout_state.count -= i;
/* Buffer c; once more than MIMEOUT_BUF_LENGTH bytes are pending an encoded
 * word is opened. */
5393 mimeout_state.buf[mimeout_state.count++] = (char)c;
5394 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5395 open_mime(output_mode);
5400 if (lastchar==CR || lastchar == LF){
5401 for (i=0;i<mimeout_state.count;i++) {
5402 (*o_mputc)(mimeout_state.buf[i]);
5405 mimeout_state.count = 0;
/* All pending bytes except the last are written; the buffer is then reset
 * to a single SP before the encoded word is opened. */
5408 for (i=0;i<mimeout_state.count-1;i++) {
5409 (*o_mputc)(mimeout_state.buf[i]);
5412 mimeout_state.buf[0] = SP;
5413 mimeout_state.count = 1;
5415 open_mime(output_mode);
5418 /* mimeout_mode == 'B', 1, 2 */
5419 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5420 output_mode == UTF_8)) {
5421 if (lastchar == CR || lastchar == LF){
5422 if (nkf_isblank(c)) {
5423 for (i=0;i<mimeout_state.count;i++) {
5424 mimeout_addchar(mimeout_state.buf[i]);
5426 mimeout_state.count = 0;
5429 for (i=0;i<mimeout_state.count;i++) {
5430 (*o_mputc)(mimeout_state.buf[i]);
5433 mimeout_state.count = 0;
5435 mimeout_state.buf[mimeout_state.count++] = (char)c;
5438 if (nkf_isspace(c)) {
5439 for (i=0;i<mimeout_state.count;i++) {
5440 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
5442 for (i=0;i<mimeout_state.count;i++) {
5443 (*o_mputc)(mimeout_state.buf[i]);
5446 mimeout_state.count = 0;
5449 mimeout_state.buf[mimeout_state.count++] = (char)c;
5450 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5452 for (j=0;j<mimeout_state.count;j++) {
5453 (*o_mputc)(mimeout_state.buf[j]);
5456 mimeout_state.count = 0;
5460 if (mimeout_state.count>0 && SP<c && c!='=') {
5461 mimeout_state.buf[mimeout_state.count++] = (char)c;
5462 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5463 j = mimeout_state.count;
5464 mimeout_state.count = 0;
5466 mimeout_addchar(mimeout_state.buf[i]);
/* Pending bytes are encoded before the current character is handled. */
5473 if (mimeout_state.count>0) {
5474 j = mimeout_state.count;
5475 mimeout_state.count = 0;
5477 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5479 mimeout_addchar(mimeout_state.buf[i]);
5485 (*o_mputc)(mimeout_state.buf[i]);
5487 open_mime(output_mode);
/*
 * base64_conv: output filter installed in front of the MIME encoder.
 * Runs mime_prechar on the pair (c2, c1) and then forwards the same pair
 * unchanged to the next stage stored in o_base64conv.
 */
5494 base64_conv(nkf_char c2, nkf_char c1)
5496 mime_prechar(c2, c1);
5497 (*o_base64conv)(c2,c1);
/*
 * nkf_iconv_t: state of an iconv based converter.
 * Visible members are the sizes of the input and output buffers and the
 * output buffer pointer; the functions below additionally use members named
 * cd and input_buffer.
 * NOTE(review): the remaining member declarations and the typedef name are
 * not visible in this listing.
 */
5501 typedef struct nkf_iconv_t {
5504 size_t input_buffer_size;
5505 char *output_buffer;
5506 size_t output_buffer_size;
/*
 * nkf_iconv_new: set up a converter from encoding fromcode to tocode.
 * Allocates an input buffer of IOBUF_SIZE bytes and an output buffer of
 * twice that size with nkf_xmalloc, then opens the iconv descriptor.
 * NOTE(review): converter is declared as a plain local and is used with the
 * arrow operator without any visible assignment - confirm how it is meant to
 * be allocated.
 */
5510 nkf_iconv_new(char *tocode, char *fromcode)
5512 nkf_iconv_t converter;
5514 converter->input_buffer_size = IOBUF_SIZE;
5515 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5516 converter->output_buffer_size = IOBUF_SIZE * 2;
5517 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5518 converter->cd = iconv_open(tocode, fromcode);
/* iconv_open reports failure by returning (iconv_t)-1. */
5519 if (converter->cd == (iconv_t)-1)
/* NOTE(review): fprintf is called here with the format string as its first
 * argument (no FILE pointer) and its int result is handed to perror, which
 * expects a string. This cannot work as written. */
5523 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5526 perror("can't iconv_open");
/*
 * nkf_iconv_convert: read bytes with i_getc into the converter input buffer,
 * run iconv on them and write the converted bytes with o_putc. On an iconv
 * error the unconsumed input is moved to the front of the input buffer or
 * the output buffer is doubled with realloc.
 * NOTE(review): not every line is visible in this listing, and several of
 * the visible lines look inconsistent; see the notes below.
 */
5532 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5534 size_t invalid = (size_t)0;
5535 char *input_buffer = converter->input_buffer;
5536 size_t input_length = (size_t)0;
5537 char *output_buffer = converter->output_buffer;
5538 size_t output_length = converter->output_buffer_size;
/* NOTE(review): the stream read here is called f, while the parameter of
 * this function is named input. */
5543 while ((c = (*i_getc)(f)) != EOF) {
5544 input_buffer[input_length++] = c;
/* NOTE(review): this leaves the loop while the buffer still has room, i.e.
 * after the first byte for any buffer larger than one byte; the comparison
 * is presumably meant to be the opposite - confirm. */
5545 if (input_length < converter->input_buffer_size) break;
5549 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
/* NOTE(review): iconv advances output_buffer and decreases output_length to
 * the space left, so indexing the advanced pointer with size minus remaining
 * space looks suspect - verify against the iconv contract. */
5550 while (output_length-- > 0) {
5551 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5553 if (ret == (size_t) - 1) {
/* Keep the bytes iconv did not consume at the start of the input buffer. */
5556 if (input_buffer != converter->input_buffer)
5557 memmove(converter->input_buffer, input_buffer, input_length);
/* Grow the output buffer. NOTE(review): the member passed to realloc is
 * spelled outbuf, whereas the struct member used everywhere else is
 * output_buffer. */
5560 converter->output_buffer_size *= 2;
5561 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5562 if (output_buffer == NULL) {
5563 perror("can't realloc");
5566 converter->output_buffer = output_buffer;
5569 perror("can't iconv");
/*
 * nkf_iconv_close: release the two buffers with nkf_xfree and close the
 * iconv descriptor.
 * NOTE(review): the parameter is named convert but the body refers to
 * converter, and the members are spelled inbuf and outbuf here whereas
 * nkf_iconv_new fills input_buffer and output_buffer - these names do not
 * match as written.
 */
5582 nkf_iconv_close(nkf_iconv_t *convert)
5584 nkf_xfree(converter->inbuf);
5585 nkf_xfree(converter->outbuf);
5586 iconv_close(converter->cd);
5595 struct input_code *p = input_code_list;
5607 mime_f = MIME_DECODE_DEFAULT;
5608 mime_decode_f = FALSE;
5613 x0201_f = NKF_UNSPECIFIED;
5614 iso2022jp_f = FALSE;
5615 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5616 ms_ucs_map_f = UCS_MAP_ASCII;
5618 #ifdef UTF8_INPUT_ENABLE
5619 no_cp932ext_f = FALSE;
5620 no_best_fit_chars_f = FALSE;
5621 encode_fallback = NULL;
5622 unicode_subchar = '?';
5623 input_endian = ENDIAN_BIG;
5625 #ifdef UTF8_OUTPUT_ENABLE
5626 output_bom_f = FALSE;
5627 output_endian = ENDIAN_BIG;
5629 #ifdef UNICODE_NORMALIZATION
5645 #ifdef SHIFTJIS_CP932
5655 for (i = 0; i < 256; i++){
5656 prefix_table[i] = 0;
5660 mimeout_state.count = 0;
5665 fold_preserve_f = FALSE;
5668 kanji_intro = DEFAULT_J;
5669 ascii_intro = DEFAULT_R;
5670 fold_margin = FOLD_MARGIN;
5671 o_zconv = no_connection;
5672 o_fconv = no_connection;
5673 o_eol_conv = no_connection;
5674 o_rot_conv = no_connection;
5675 o_hira_conv = no_connection;
5676 o_base64conv = no_connection;
5677 o_iso2022jp_check_conv = no_connection;
5680 i_ungetc = std_ungetc;
5682 i_bungetc = std_ungetc;
5685 i_mungetc = std_ungetc;
5686 i_mgetc_buf = std_getc;
5687 i_mungetc_buf = std_ungetc;
5688 output_mode = ASCII;
5690 mime_decode_mode = FALSE;
5696 z_prev2=0,z_prev1=0;
5698 iconv_for_check = 0;
5700 input_codename = NULL;
5701 input_encoding = NULL;
5702 output_encoding = NULL;
/*
 * module_connection: wire up the conversion pipeline from the option flags.
 * Output side: the encoder for output_encoding becomes oconv, and each
 * enabled filter is spliced in front of it by saving the current oconv in
 * the o_* slot of that filter and replacing oconv with the filter.
 * Input side: the same is done for i_getc and i_ungetc with the input
 * decoders.
 * kanji_convert treats a negative return value as failure.
 * NOTE(review): the conditions guarding several of the splices are not
 * visible in this listing.
 */
5710 module_connection(void)
5712 if (input_encoding) set_input_encoding(input_encoding);
/* No output encoding requested: use the default, and when there is none
 * fall back to ISO-2022-JP for the no-output and guess modes. */
5713 if (!output_encoding) {
5714 output_encoding = nkf_default_encoding();
5716 if (!output_encoding) {
5717 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5720 set_output_encoding(output_encoding);
5721 oconv = nkf_enc_to_oconv(output_encoding);
5723 if (nkf_enc_unicode_p(output_encoding))
5724 output_mode = UTF_8;
5726 if (x0201_f == NKF_UNSPECIFIED) {
5727 x0201_f = X0201_DEFAULT;
5730 /* replace continuation module, from output side */
5732 /* output redirection */
5734 if (noout_f || guess_f){
5741 if (mimeout_f == TRUE) {
5742 o_base64conv = oconv; oconv = base64_conv;
5744 /* base64_count = 0; */
5747 if (eolmode_f || guess_f) {
5748 o_eol_conv = oconv; oconv = eol_conv;
5751 o_rot_conv = oconv; oconv = rot_conv;
5754 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5757 o_hira_conv = oconv; oconv = hira_conv;
5760 o_fconv = oconv; oconv = fold_conv;
5763 if (alpha_f || x0201_f) {
5764 o_zconv = oconv; oconv = z_conv;
5768 i_ungetc = std_ungetc;
5769 /* input redirection */
5772 i_cgetc = i_getc; i_getc = cap_getc;
5773 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5776 i_ugetc = i_getc; i_getc = url_getc;
5777 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5780 #ifdef NUMCHAR_OPTION
5782 i_ngetc = i_getc; i_getc = numchar_getc;
5783 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5786 #ifdef UNICODE_NORMALIZATION
5788 i_nfc_getc = i_getc; i_getc = nfc_getc;
5789 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
/* MIME decoding with a fixed encoding is installed as a getc filter. */
5792 if (mime_f && mimebuf_f==FIXED_MIME) {
5793 i_mgetc = i_getc; i_getc = mime_getc;
5794 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5797 i_bgetc = i_getc; i_getc = broken_getc;
5798 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* An explicit input encoding selects its decoder; otherwise the e_iconv
 * decoder is installed with a FALSE first argument. */
5800 if (input_encoding) {
5801 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5803 set_iconv(FALSE, e_iconv);
5807 struct input_code *p = input_code_list;
5816 Conversion main loop. Code detection only.
5819 #if !defined(PERL_XS) && !defined(WIN32DLL)
5826 module_connection();
5827 while ((c = (*i_getc)(f)) != EOF)
/*
 * Control-flow shorthands for the main loop of kanji_convert.
 * NOTE(review): SKIP and MORE expand to two statements, so they are only
 * safe inside a braced block; after an unbraced if only the assignment would
 * be conditional.
 */
5834 #define NEXT continue /* no output, get next */
5835 #define SKIP c2=0;continue /* no output, get next */
5836 #define MORE c2=c1;continue /* need one more byte */
5837 #define SEND (void)0 /* output c1 and c2, get next */
5838 #define LAST break /* end of loop, go closing */
/* set_input_mode(mode): stores mode in input_mode; the visible part also
 * records and logs the input code name ISO-2022-JP.
 * NOTE(review): some continuation lines of this macro are not visible in
 * this listing. */
5839 #define set_input_mode(mode) do { \
5840 input_mode = mode; \
5842 set_input_codename("ISO-2022-JP"); \
5843 debug("ISO-2022-JP"); \
/*
 * kanji_convert: main conversion loop over the input stream f.
 * After module_connection has built the pipeline, bytes are read through
 * i_getc. UTF-32 and UTF-16 input is consumed in fixed-size units; all other
 * input goes through a byte-wise state machine that tracks escape sequences,
 * shift state and MIME headers and passes decoded characters to iconv or
 * oconv.
 * NOTE(review): a large number of lines of this function are not visible in
 * this listing; the comments describe the visible statements only.
 */
5847 kanji_convert(FILE *f)
5849 nkf_char c1=0, c2=0, c3=0, c4=0;
5850 int shift_mode = 0; /* 0, 1, 2, 3 */
5852 int is_8bit = FALSE;
5854 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5859 output_mode = ASCII;
5861 if (module_connection() < 0) {
5862 #if !defined(PERL_XS) && !defined(WIN32DLL)
5863 fprintf(stderr, "no output encoding given\n");
5869 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: four bytes per unit; a second unit is read when the first
 * one may be followed by a combining character. */
5870 if(iconv == w_iconv32){
5871 while ((c1 = (*i_getc)(f)) != EOF &&
5872 (c2 = (*i_getc)(f)) != EOF &&
5873 (c3 = (*i_getc)(f)) != EOF &&
5874 (c4 = (*i_getc)(f)) != EOF) {
5875 nkf_char c5, c6, c7, c8;
5876 if (nkf_iconv_utf_32(c1, c2, c3, c4) == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5877 if ((c5 = (*i_getc)(f)) != EOF &&
5878 (c6 = (*i_getc)(f)) != EOF &&
5879 (c7 = (*i_getc)(f)) != EOF &&
5880 (c8 = (*i_getc)(f)) != EOF) {
5881 if (nkf_iconv_utf_32_combine(c1, c2, c3, c4, c5, c6, c7, c8)) {
5886 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
5889 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
/* UTF-16 input: two bytes per unit; two more are read when the converter
 * asks for them or when a combining character may follow. */
5895 else if (iconv == w_iconv16) {
5896 while ((c1 = (*i_getc)(f)) != EOF &&
5897 (c2 = (*i_getc)(f)) != EOF) {
5898 size_t ret = nkf_iconv_utf_16(c1, c2, 0, 0);
5899 if (ret == NKF_ICONV_NEED_TWO_MORE_BYTES &&
5900 (c3 = (*i_getc)(f)) != EOF &&
5901 (c4 = (*i_getc)(f)) != EOF) {
5902 nkf_iconv_utf_16(c1, c2, c3, c4);
5903 } else if (ret == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5904 if ((c3 = (*i_getc)(f)) != EOF &&
5905 (c4 = (*i_getc)(f)) != EOF) {
5906 if (nkf_iconv_utf_16_combine(c1, c2, c3, c4)) {
5909 nkf_iconv_utf_16_nocombine(c1, c2);
5912 nkf_iconv_utf_16_nocombine(c1, c2);
/* Byte-oriented input: one byte per iteration. */
5920 while ((c1 = (*i_getc)(f)) != EOF) {
5921 #ifdef INPUT_CODE_FIX
5922 if (!input_encoding)
5927 if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
5928 /* in case of 8th bit is on */
5929 if (!estab_f&&!mime_decode_mode) {
5930 /* in case of not established yet */
5931 /* It is still ambiguous */
5932 if (h_conv(f, c2, c1)==EOF) {
5940 /* in case of already established */
5942 /* ignore bogus code */
5950 /* 2nd byte of 7 bit code or SJIS */
5954 else if (nkf_char_unicode_p(c1)) {
5960 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5963 }else if (input_codename && input_codename[0] == 'I' &&
5964 0xA1 <= c1 && c1 <= 0xDF) {
5965 /* JIS X 0201 Katakana in 8bit JIS */
5966 c2 = JIS_X_0201_1976_K;
5969 } else if (c1 > DEL) {
5971 if (!estab_f && !iso8859_f) {
5972 /* not established yet */
5974 } else { /* estab_f==TRUE */
5980 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5981 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5983 c2 = JIS_X_0201_1976_K;
5988 /* already established */
5992 } else if (SP < c1 && c1 < DEL) {
5993 /* in case of Roman characters */
5995 /* output 1 shifted byte */
5999 } else if (nkf_byte_jisx0201_katakana_p(c1)){
6000 /* output 1 shifted byte */
6001 c2 = JIS_X_0201_1976_K;
6004 /* look like bogus code */
6007 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
6008 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
6009 /* in case of Kanji shifted */
6011 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
6012 /* Check MIME code */
6013 if ((c1 = (*i_getc)(f)) == EOF) {
6016 } else if (c1 == '?') {
6017 /* =? is mime conversion start sequence */
6018 if(mime_f == STRICT_MIME) {
6019 /* check in real detail */
6020 if (mime_begin_strict(f) == EOF)
6023 } else if (mime_begin(f) == EOF)
6032 /* normal ASCII code */
/* SI, SO and ESC are only interpreted while no 8-bit byte has been seen or
 * while a MIME word is being decoded. */
6035 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
6038 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
6041 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
6042 if ((c1 = (*i_getc)(f)) == EOF) {
6046 else if (c1 == '&') {
6048 if ((c1 = (*i_getc)(f)) == EOF) {
6054 else if (c1 == '$') {
6056 if ((c1 = (*i_getc)(f)) == EOF) {
6057 /* don't send bogus code
6059 (*oconv)(0, '$'); */
6061 } else if (c1 == '@' || c1 == 'B') {
6063 set_input_mode(JIS_X_0208);
6065 } else if (c1 == '(') {
6067 if ((c1 = (*i_getc)(f)) == EOF) {
6068 /* don't send bogus code
6074 } else if (c1 == '@'|| c1 == 'B') {
6076 set_input_mode(JIS_X_0208);
6079 } else if (c1 == 'D'){
6080 set_input_mode(JIS_X_0212);
6082 #endif /* X0212_ENABLE */
6083 } else if (c1 == 'O' || c1 == 'Q'){
6084 set_input_mode(JIS_X_0213_1);
6086 } else if (c1 == 'P'){
6087 set_input_mode(JIS_X_0213_2);
6090 /* could be some special code */
6097 } else if (broken_f&0x2) {
6098 /* accept any ESC-(-x as broken code ... */
6099 input_mode = JIS_X_0208;
6108 } else if (c1 == '(') {
6110 if ((c1 = (*i_getc)(f)) == EOF) {
6111 /* don't send bogus code
6113 (*oconv)(0, '('); */
6116 else if (c1 == 'I') {
6117 /* JIS X 0201 Katakana */
6118 set_input_mode(JIS_X_0201_1976_K);
6122 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
6123 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
6124 set_input_mode(ASCII);
6127 else if (broken_f&0x2) {
6128 set_input_mode(ASCII);
6137 else if (c1 == '.') {
6139 if ((c1 = (*i_getc)(f)) == EOF) {
6142 else if (c1 == 'A') {
6153 else if (c1 == 'N') {
6156 if (g2 == ISO_8859_1) {
6172 } else if (c1 == ESC && iconv == s_iconv) {
6173 /* ESC in Shift_JIS */
6174 if ((c1 = (*i_getc)(f)) == EOF) {
6177 } else if (c1 == '$') {
6179 if ((c1 = (*i_getc)(f)) == EOF) {
6181 } else if (('E' <= c1 && c1 <= 'G') ||
6182 ('O' <= c1 && c1 <= 'Q')) {
/* The table is indexed with c1 % 7. For the letters E, F, G, O, P and Q this
 * gives 6, 0, 1, 2, 3 and 4, so index 5 is presumably never used - confirm
 * that this statement is only reached through the range test above. */
6190 static const nkf_char jphone_emoji_first_table[7] =
6191 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
6192 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
6193 if ((c1 = (*i_getc)(f)) == EOF) LAST;
/* Every following byte from SP to z is output as c1 + c3. */
6194 while (SP <= c1 && c1 <= 'z') {
6195 (*oconv)(0, c1 + c3);
6196 if ((c1 = (*i_getc)(f)) == EOF) LAST;
6212 } else if (c1 == LF || c1 == CR) {
6214 input_mode = ASCII; set_iconv(FALSE, 0);
6216 } else if (mime_decode_f && !mime_decode_mode){
6218 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
6226 } else { /* if (c1 == CR)*/
6227 if ((c1=(*i_getc)(f))!=EOF) {
6231 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
/* Hand the collected bytes to the input decoder; the visible follow-up code
 * reads further bytes (c3 to c6) and retries with the longer sequence. */
6251 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
6254 if ((c3 = (*i_getc)(f)) != EOF) {
6257 if ((c4 = (*i_getc)(f)) != EOF) {
6259 (*iconv)(c2, c1, c3|c4);
6264 /* 4 bytes UTF-8 (check combining character) */
6265 if ((c3 = (*i_getc)(f)) != EOF) {
6266 if ((c4 = (*i_getc)(f)) != EOF) {
6267 if (w_iconv_combine(c2, c1, 0, c3, c4, 0)) {
6270 w_iconv_nocombine(c2, c1, 0);
6274 w_iconv_nocombine(c2, c1, 0);
6277 w_iconv_nocombine(c2, c1, 0);
6281 /* 3 bytes EUC or UTF-8 */
6282 if ((c3 = (*i_getc)(f)) != EOF) {
6284 if ((*iconv)(c2, c1, c3) == -3) {
6285 /* 6 bytes UTF-8 (check combining character) */
6287 if ((c4 = (*i_getc)(f)) != EOF) {
6288 if ((c5 = (*i_getc)(f)) != EOF) {
6289 if ((c6 = (*i_getc)(f)) != EOF) {
6290 if (w_iconv_combine(c2, c1, c3, c4, c5, c6)) {
6294 w_iconv_nocombine(c2, c1, c3);
6299 w_iconv_nocombine(c2, c1, c3);
6303 w_iconv_nocombine(c2, c1, c3);
6306 w_iconv_nocombine(c2, c1, c3);
/* Rows 0x7F to 0x92 with a cell byte 0x21 to 0x7E are mapped linearly
 * (94 cells per row) onto code points starting at 0xE000. */
6316 0x7F <= c2 && c2 <= 0x92 &&
6317 0x21 <= c1 && c1 <= 0x7E) {
6319 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
6322 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
6326 (*oconv)(PREFIX_EUCG3 | c2, c1);
6328 #endif /* X0212_ENABLE */
6330 (*oconv)(PREFIX_EUCG3 | c2, c1);
6333 (*oconv)(input_mode, c1); /* other special case */
6339 /* goto next_word */
/* End of input: the decoder is called once with EOF. When no input code
 * name has been set, the input_code_list entry with the lowest score
 * supplies the name. */
6344 (*iconv)(EOF, 0, 0);
6345 if (!input_codename)
6348 struct input_code *p = input_code_list;
6349 struct input_code *result = p;
6351 if (p->score < result->score) result = p;
6354 set_input_codename(result->name);
6356 debug(result->name);
6364 * int options(unsigned char *cp)
/*
 * options: parse one option string cp and set the corresponding global
 * flags and encodings.
 * Long options (introduced by a second dash) are looked up in long_option[];
 * an entry with an alias is expanded by continuing the parse on the alias
 * text. Short options are handled one character at a time.
 * NOTE(review): many lines of this function are not visible in this
 * listing; the comments describe the visible statements only.
 */
6371 options(unsigned char *cp)
6375 unsigned char *cp_back = NULL;
/* Skip everything up to and including the first dash. */
6380 while(*cp && *cp++!='-');
6381 while (*cp || cp_back) {
6389 case '-': /* literal options */
6390 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/* Match the text at cp against each table entry, comparing up to an equals
 * sign in the entry name. */
6394 for (i=0;i<(int)(sizeof(long_option)/sizeof(long_option[0]));i++) {
6395 p = (unsigned char *)long_option[i].name;
6396 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
6397 if (*p == cp[j] || cp[j] == SP){
6404 #if !defined(PERL_XS) && !defined(WIN32DLL)
6405 fprintf(stderr, "unknown long option: --%s\n", cp);
6409 while(*cp && *cp != SP && cp++);
6410 if (long_option[i].alias[0]){
6412 cp = (unsigned char *)long_option[i].alias;
6415 if (strcmp(long_option[i].name, "help") == 0){
6420 if (strcmp(long_option[i].name, "ic=") == 0){
6421 enc = nkf_enc_find((char *)p);
6423 input_encoding = enc;
6426 if (strcmp(long_option[i].name, "oc=") == 0){
6427 enc = nkf_enc_find((char *)p);
6428 /* if (enc <= 0) continue; */
6430 output_encoding = enc;
6433 if (strcmp(long_option[i].name, "guess=") == 0){
6434 if (p[0] == '0' || p[0] == '1') {
6442 if (strcmp(long_option[i].name, "overwrite") == 0){
6445 preserve_time_f = TRUE;
6448 if (strcmp(long_option[i].name, "overwrite=") == 0){
6451 preserve_time_f = TRUE;
6453 backup_suffix = (char *)p;
6456 if (strcmp(long_option[i].name, "in-place") == 0){
6459 preserve_time_f = FALSE;
6462 if (strcmp(long_option[i].name, "in-place=") == 0){
6465 preserve_time_f = FALSE;
6467 backup_suffix = (char *)p;
6472 if (strcmp(long_option[i].name, "cap-input") == 0){
6476 if (strcmp(long_option[i].name, "url-input") == 0){
6481 #ifdef NUMCHAR_OPTION
6482 if (strcmp(long_option[i].name, "numchar-input") == 0){
6488 if (strcmp(long_option[i].name, "no-output") == 0){
6492 if (strcmp(long_option[i].name, "debug") == 0){
6497 if (strcmp(long_option[i].name, "cp932") == 0){
6498 #ifdef SHIFTJIS_CP932
6502 #ifdef UTF8_OUTPUT_ENABLE
6503 ms_ucs_map_f = UCS_MAP_CP932;
6507 if (strcmp(long_option[i].name, "no-cp932") == 0){
6508 #ifdef SHIFTJIS_CP932
6512 #ifdef UTF8_OUTPUT_ENABLE
6513 ms_ucs_map_f = UCS_MAP_ASCII;
6517 #ifdef SHIFTJIS_CP932
6518 if (strcmp(long_option[i].name, "cp932inv") == 0){
6525 if (strcmp(long_option[i].name, "x0212") == 0){
6532 if (strcmp(long_option[i].name, "exec-in") == 0){
6536 if (strcmp(long_option[i].name, "exec-out") == 0){
6541 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
6542 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
6543 no_cp932ext_f = TRUE;
6546 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
6547 no_best_fit_chars_f = TRUE;
6550 if (strcmp(long_option[i].name, "fb-skip") == 0){
6551 encode_fallback = NULL;
6554 if (strcmp(long_option[i].name, "fb-html") == 0){
6555 encode_fallback = encode_fallback_html;
6558 if (strcmp(long_option[i].name, "fb-xml") == 0){
6559 encode_fallback = encode_fallback_xml;
6562 if (strcmp(long_option[i].name, "fb-java") == 0){
6563 encode_fallback = encode_fallback_java;
6566 if (strcmp(long_option[i].name, "fb-perl") == 0){
6567 encode_fallback = encode_fallback_perl;
6570 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6571 encode_fallback = encode_fallback_subchar;
/* fb-subchar=N: N is parsed as decimal, hexadecimal (x or X as the second
 * character) or octal, then converted with w16e_conv and packed as two
 * bytes.
 * NOTE(review): the digit loops reuse i, which until here indexed
 * long_option[]; any later long_option[i] test in the same pass would see
 * the overwritten value - confirm that control leaves the chain first. */
6574 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6575 encode_fallback = encode_fallback_subchar;
6576 unicode_subchar = 0;
6578 /* decimal number */
6579 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6580 unicode_subchar *= 10;
6581 unicode_subchar += hex2bin(p[i]);
6583 }else if(p[1] == 'x' || p[1] == 'X'){
6584 /* hexadecimal number */
6585 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6586 unicode_subchar <<= 4;
6587 unicode_subchar |= hex2bin(p[i]);
6591 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6592 unicode_subchar *= 8;
6593 unicode_subchar += hex2bin(p[i]);
6596 w16e_conv(unicode_subchar, &i, &j);
6597 unicode_subchar = i<<8 | j;
6601 #ifdef UTF8_OUTPUT_ENABLE
6602 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6603 ms_ucs_map_f = UCS_MAP_MS;
6607 #ifdef UNICODE_NORMALIZATION
6608 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/* prefix=XYZ...: the first character X becomes the prefix stored in
 * prefix_table[] for each of the following printable characters. The loop
 * reuses i in the same way as the fb-subchar= handler. */
6613 if (strcmp(long_option[i].name, "prefix=") == 0){
6614 if (nkf_isgraph(p[0])){
6615 for (i = 1; nkf_isgraph(p[i]); i++){
6616 prefix_table[p[i]] = p[0];
6621 #if !defined(PERL_XS) && !defined(WIN32DLL)
6622 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6627 case 'b': /* buffered mode */
6630 case 'u': /* unbuffered mode */
6633 case 't': /* transparent mode */
6638 } else if (*cp=='2') {
6642 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6650 case 'j': /* JIS output */
6652 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6654 case 'e': /* AT&T EUC output */
6655 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6657 case 's': /* SJIS output */
6658 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6660 case 'l': /* ISO8859 Latin-1 support, no conversion */
6661 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6662 input_encoding = nkf_enc_from_index(ISO_8859_1);
6664 case 'i': /* Kanji IN ESC-$-@/B */
6665 if (*cp=='@'||*cp=='B')
6666 kanji_intro = *cp++;
6668 case 'o': /* ASCII IN ESC-(-J/B/H */
6669 /* ESC ( H was used in initial JUNET messages */
6670 if (*cp=='J'||*cp=='B'||*cp=='H')
6671 ascii_intro = *cp++;
6675 bit:1 katakana->hiragana
6676 bit:2 hiragana->katakana
6678 if ('9'>= *cp && *cp>='0')
6679 hira_f |= (*cp++ -'0');
6686 #if defined(MSDOS) || defined(__OS2__)
6693 show_configuration();
6701 #ifdef UTF8_OUTPUT_ENABLE
6702 case 'w': /* UTF-{8,16,32} output */
6707 output_encoding = nkf_enc_from_index(UTF_8N);
6709 output_bom_f = TRUE;
6710 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6714 if ('1'== cp[0] && '6'==cp[1]) {
6717 } else if ('3'== cp[0] && '2'==cp[1]) {
6721 output_encoding = nkf_enc_from_index(UTF_8);
6726 output_endian = ENDIAN_LITTLE;
6727 output_bom_f = TRUE;
6728 } else if (cp[0] == 'B') {
6730 output_bom_f = TRUE;
6733 output_bom_f = FALSE;
/* Resolve the generic UTF-16 or UTF-32 index to the endian specific entry;
 * the second assignment selects the entries that carry a BOM. */
6735 enc_idx = enc_idx == UTF_16
6736 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6737 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6739 enc_idx = enc_idx == UTF_16
6740 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6741 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6743 output_encoding = nkf_enc_from_index(enc_idx);
6747 #ifdef UTF8_INPUT_ENABLE
6748 case 'W': /* UTF input */
6751 input_encoding = nkf_enc_from_index(UTF_8);
6754 if ('1'== cp[0] && '6'==cp[1]) {
6756 input_endian = ENDIAN_BIG;
6758 } else if ('3'== cp[0] && '2'==cp[1]) {
6760 input_endian = ENDIAN_BIG;
6763 input_encoding = nkf_enc_from_index(UTF_8);
6768 input_endian = ENDIAN_LITTLE;
6769 } else if (cp[0] == 'B') {
6771 input_endian = ENDIAN_BIG;
6773 enc_idx = (enc_idx == UTF_16
6774 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6775 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6776 input_encoding = nkf_enc_from_index(enc_idx);
6780 /* Input code assumption */
6781 case 'J': /* ISO-2022-JP input */
6782 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6784 case 'E': /* EUC-JP input */
6785 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6787 case 'S': /* Shift_JIS input */
6788 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6790 case 'Z': /* Convert X0208 alphabet to ASCII */
6792 bit:0 Convert JIS X 0208 Alphabet to ASCII
6793 bit:1 Convert Kankaku to one space
6794 bit:2 Convert Kankaku to two spaces
6795 bit:3 Convert HTML Entity
6796 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6798 while ('0'<= *cp && *cp <='4') {
6799 alpha_f |= 1 << (*cp++ - '0');
6803 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6804 x0201_f = FALSE; /* No X0201->X0208 conversion */
6806 ESC-(-I in JIS, EUC, MS Kanji
6807 SI/SO in JIS, EUC, MS Kanji
6808 SS2 in EUC, JIS, not in MS Kanji
6809 MS Kanji (0xa0-0xdf)
6811 ESC-(-I in JIS (0x20-0x5f)
6812 SS2 in EUC (0xa0-0xdf)
6813 0xa0-0xd in MS Kanji (0xa0-0xdf)
6816 case 'X': /* Convert X0201 kana to X0208 */
/* NOTE(review): no statement is visible between the F handler and the f
 * label, so F presumably falls through into the folding code - confirm. */
6819 case 'F': /* preserve new lines */
6820 fold_preserve_f = TRUE;
6821 case 'f': /* folding -f60 or -f */
6824 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6826 fold_len += *cp++ - '0';
/* A fold length outside 1..BUFSIZ-1 is replaced by the default. */
6828 if (!(0<fold_len && fold_len<BUFSIZ))
6829 fold_len = DEFAULT_FOLD;
6833 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6835 fold_margin += *cp++ - '0';
6839 case 'm': /* MIME support */
6840 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6841 if (*cp=='B'||*cp=='Q') {
6842 mime_decode_mode = *cp++;
6843 mimebuf_f = FIXED_MIME;
6844 } else if (*cp=='N') {
6845 mime_f = TRUE; cp++;
6846 } else if (*cp=='S') {
6847 mime_f = STRICT_MIME; cp++;
6848 } else if (*cp=='0') {
6849 mime_decode_f = FALSE;
6850 mime_f = FALSE; cp++;
6852 mime_f = STRICT_MIME;
6855 case 'M': /* MIME output */
6858 mimeout_f = FIXED_MIME; cp++;
6859 } else if (*cp=='Q') {
6861 mimeout_f = FIXED_MIME; cp++;
6866 case 'B': /* Broken JIS support */
6868 bit:1 allow any x on ESC-(-x or ESC-$-x
6869 bit:2 reset to ascii on NL
6871 if ('9'>= *cp && *cp>='0')
6872 broken_f |= 1<<(*cp++ -'0');
6877 case 'O':/* for Output file */
6881 case 'c':/* add cr code */
6884 case 'd':/* delete cr code */
6887 case 'I': /* ISO-2022-JP output */
6890 case 'L': /* line mode */
6891 if (*cp=='u') { /* unix */
6892 eolmode_f = LF; cp++;
6893 } else if (*cp=='m') { /* mac */
6894 eolmode_f = CR; cp++;
6895 } else if (*cp=='w') { /* windows */
6896 eolmode_f = CRLF; cp++;
6897 } else if (*cp=='0') { /* no conversion */
6898 eolmode_f = 0; cp++;
6903 if ('2' <= *cp && *cp <= '9') {
6906 } else if (*cp == '0' || *cp == '1') {
6915 /* module multiple options in a string are allowed for Perl module */
6916 while(*cp && *cp++!='-');
6919 #if !defined(PERL_XS) && !defined(WIN32DLL)
6920 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6922 /* bogus option but ignored */
6930 #include "nkf32dll.c"
6931 #elif defined(PERL_XS)
6932 #else /* WIN32DLL */
/*
 * main: command line entry point.
 * Leading arguments that start with a dash are taken as options. Without
 * file arguments stdin is converted to stdout; otherwise each named file is
 * opened and converted, optionally into a temporary file that afterwards
 * replaces the original (with permissions, timestamps and an optional
 * backup copy handled as shown below).
 * NOTE(review): many lines of this function are not visible in this
 * listing; the comments describe the visible statements only.
 */
6934 main(int argc, char **argv)
6939 char *outfname = NULL;
6942 #ifdef EASYWIN /*Easy Win */
6943 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6945 #ifdef DEFAULT_CODE_LOCALE
6946 setlocale(LC_CTYPE, "");
/* Consume the leading arguments that begin with a dash. */
6950 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6951 cp = (unsigned char *)*argv;
6956 if (pipe(fds) < 0 || (pid = fork()) < 0){
6967 execvp(argv[1], &argv[1]);
/* A set of flags is saved and written back further down.
 * NOTE(review): the statement between the save and the restore is not
 * visible in this listing. */
6984 int debug_f_back = debug_f;
6987 int exec_f_back = exec_f;
6990 int x0212_f_back = x0212_f;
6992 int x0213_f_back = x0213_f;
6993 int guess_f_back = guess_f;
6995 guess_f = guess_f_back;
6998 debug_f = debug_f_back;
7001 exec_f = exec_f_back;
7003 x0212_f = x0212_f_back;
7004 x0213_f = x0213_f_back;
7007 if (binmode_f == TRUE)
7008 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7009 if (freopen("","wb",stdout) == NULL)
7016 setbuf(stdout, (char *) NULL);
7018 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
7021 if (binmode_f == TRUE)
7022 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7023 if (freopen("","rb",stdin) == NULL) return (-1);
7027 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
/* Convert standard input. */
7031 kanji_convert(stdin);
7032 if (guess_f) print_guessed_code(NULL);
7036 int is_argument_error = FALSE;
7038 input_codename = NULL;
7041 iconv_for_check = 0;
/* A file that cannot be opened is recorded as an argument error. */
7043 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
7045 is_argument_error = TRUE;
7053 /* reopen file for stdout */
7054 if (file_out_f == TRUE) {
/* The temporary name is built from the original name plus a template
 * suffix, so the buffer is sized for both. */
7057 outfname = nkf_xmalloc(strlen(origfname)
7058 + strlen(".nkftmpXXXXXX")
7060 strcpy(outfname, origfname);
7064 for (i = strlen(outfname); i; --i){
7065 if (outfname[i - 1] == '/'
7066 || outfname[i - 1] == '\\'){
7072 strcat(outfname, "ntXXXXXX");
7074 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
7075 S_IREAD | S_IWRITE);
7077 strcat(outfname, ".nkftmpXXXXXX");
7078 fd = mkstemp(outfname);
/* stdout is duplicated for later restoration and then redirected to the
 * temporary file. */
7081 || (fd_backup = dup(fileno(stdout))) < 0
7082 || dup2(fd, fileno(stdout)) < 0
7093 outfname = "nkf.out";
7096 if(freopen(outfname, "w", stdout) == NULL) {
7100 if (binmode_f == TRUE) {
7101 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7102 if (freopen("","wb",stdout) == NULL)
7109 if (binmode_f == TRUE)
7110 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7111 if (freopen("","rb",fin) == NULL)
7116 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
7120 char *filename = NULL;
/* The file name is passed to print_guessed_code only when more than one
 * file is processed. */
7122 if (nfiles > 1) filename = origfname;
7123 if (guess_f) print_guessed_code(filename);
7129 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* Point stdout back at the descriptor saved in fd_backup. */
7137 if (dup2(fd_backup, fileno(stdout)) < 0){
7140 if (stat(origfname, &sb)) {
7141 fprintf(stderr, "Can't stat %s\n", origfname);
7143 /* Restore the permissions (translated from the original Japanese comment). */
7144 if (chmod(outfname, sb.st_mode)) {
7145 fprintf(stderr, "Can't set permission %s\n", outfname);
7148 /* Restore the timestamp (translated from the original Japanese comment). */
7149 if(preserve_time_f){
7150 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
7151 tb[0] = tb[1] = sb.st_mtime;
7152 if (utime(outfname, tb)) {
7153 fprintf(stderr, "Can't set timestamp %s\n", outfname);
7156 tb.actime = sb.st_atime;
7157 tb.modtime = sb.st_mtime;
7158 if (utime(outfname, &tb)) {
7159 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* Backup handling: the original is renamed to the backup name after any
 * existing file of that name has been removed. */
7164 char *backup_filename = get_backup_filename(backup_suffix, origfname);
7166 unlink(backup_filename);
7168 if (rename(origfname, backup_filename)) {
7169 perror(backup_filename);
7170 fprintf(stderr, "Can't rename %s to %s\n",
7171 origfname, backup_filename);
7173 nkf_xfree(backup_filename);
7176 if (unlink(origfname)){
/* Move the converted temporary file into the place of the original. */
7181 if (rename(outfname, origfname)) {
7183 fprintf(stderr, "Can't rename %s to %s\n",
7184 outfname, origfname);
7186 nkf_xfree(outfname);
7191 if (is_argument_error)
7194 #ifdef EASYWIN /*Easy Win */
7195 if (file_out_f == FALSE)
7196 scanf("%d",&end_check);
7199 #else /* for Other OS */
7200 if (file_out_f == TRUE)
7202 #endif /*Easy Win */
7205 #endif /* WIN32DLL */