2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2013, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.1.4"
24 #define NKF_RELEASE_DATE "2015-12-12"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2015, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
215 {"ISO-2022-JP", ISO_2022_JP},
216 {"ISO2022JP-CP932", CP50220},
217 {"CP50220", CP50220},
218 {"CP50221", CP50221},
219 {"CSISO2022JP", CP50221},
220 {"CP50222", CP50222},
221 {"ISO-2022-JP-1", ISO_2022_JP_1},
222 {"ISO-2022-JP-3", ISO_2022_JP_3},
223 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
224 {"SHIFT_JIS", SHIFT_JIS},
226 {"MS_Kanji", SHIFT_JIS},
228 {"WINDOWS-31J", WINDOWS_31J},
229 {"CSWINDOWS31J", WINDOWS_31J},
230 {"CP932", WINDOWS_31J},
231 {"MS932", WINDOWS_31J},
232 {"CP10001", CP10001},
235 {"EUCJP-NKF", EUCJP_NKF},
236 {"CP51932", CP51932},
237 {"EUC-JP-MS", EUCJP_MS},
238 {"EUCJP-MS", EUCJP_MS},
239 {"EUCJPMS", EUCJP_MS},
240 {"EUC-JP-ASCII", EUCJP_ASCII},
241 {"EUCJP-ASCII", EUCJP_ASCII},
242 {"SHIFT_JISX0213", SHIFT_JISX0213},
243 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
244 {"EUC-JISX0213", EUC_JISX0213},
245 {"EUC-JIS-2004", EUC_JIS_2004},
248 {"UTF-8-BOM", UTF_8_BOM},
249 {"UTF8-MAC", UTF8_MAC},
250 {"UTF-8-MAC", UTF8_MAC},
252 {"UTF-16BE", UTF_16BE},
253 {"UTF-16BE-BOM", UTF_16BE_BOM},
254 {"UTF-16LE", UTF_16LE},
255 {"UTF-16LE-BOM", UTF_16LE_BOM},
257 {"UTF-32BE", UTF_32BE},
258 {"UTF-32BE-BOM", UTF_32BE_BOM},
259 {"UTF-32LE", UTF_32LE},
260 {"UTF-32LE-BOM", UTF_32LE_BOM},
265 #if defined(DEFAULT_CODE_JIS)
266 #define DEFAULT_ENCIDX ISO_2022_JP
267 #elif defined(DEFAULT_CODE_SJIS)
268 #define DEFAULT_ENCIDX SHIFT_JIS
269 #elif defined(DEFAULT_CODE_WINDOWS_31J)
270 #define DEFAULT_ENCIDX WINDOWS_31J
271 #elif defined(DEFAULT_CODE_EUC)
272 #define DEFAULT_ENCIDX EUC_JP
273 #elif defined(DEFAULT_CODE_UTF8)
274 #define DEFAULT_ENCIDX UTF_8
/*
 * is_alnum(c): nonzero when c is an ASCII letter ('a'-'z', 'A'-'Z') or a
 * decimal digit ('0'-'9'), zero otherwise.
 *
 * The parameter is parenthesized at every use so that an expression
 * argument (for example `x & 0x7F`) is classified as a single value.
 * The argument is still evaluated more than once, so it must not have
 * side effects.
 */
#define is_alnum(c) \
    (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z') || ('0' <= (c) && (c) <= '9'))
281 /* I don't trust portability of toupper */
/*
 * ASCII-only character classification and case conversion helpers.
 *
 * Each macro yields nonzero when c falls in the named class; nkf_toupper
 * yields the upper-case form of 'a'-'z' and any other value unchanged.
 * nkf_isblank, nkf_isspace and nkf_isprint rely on the SP, TAB, CR and LF
 * constants defined elsewhere in this file.
 *
 * Every parameter use is parenthesized so that expression arguments such
 * as `x & 0x7F` are treated as a single value.  Arguments are evaluated
 * more than once, so they must not have side effects.
 */
#define nkf_toupper(c)  (('a' <= (c) && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define nkf_isoctal(c)  ('0' <= (c) && (c) <= '7')
#define nkf_isdigit(c)  ('0' <= (c) && (c) <= '9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a' <= (c) && (c) <= 'f') || ('A' <= (c) && (c) <= 'F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP <= (c) && (c) <= '~')
#define nkf_isgraph(c)  ('!' <= (c) && (c) <= '~')
/*
 * hex2bin(c): numeric value (0-15) of the hexadecimal digit character c,
 * accepting '0'-'9', 'A'-'F' and 'a'-'f'.  Any other value yields 0.
 *
 * The parameter is parenthesized so expression arguments are handled as
 * one value; it is evaluated more than once, so avoid side effects.
 */
#define hex2bin(c) \
    (('0' <= (c) && (c) <= '9') ? ((c) - '0') : \
     ('A' <= (c) && (c) <= 'F') ? ((c) - 'A' + 10) : \
     ('a' <= (c) && (c) <= 'f') ? ((c) - 'a' + 10) : 0)
/*
 * bin2hex(c): upper-case hexadecimal digit character for the low four
 * bits of c.  The parameter is parenthesized before masking so that an
 * argument such as `a | b` cannot index outside the 16-character table.
 */
#define bin2hex(c) ("0123456789ABCDEF"[(c) & 15])
/*
 * Byte/code-point range predicates.  Every parameter use is parenthesized
 * so that expression arguments are tested as a single value; arguments may
 * be evaluated more than once, so they must not have side effects.
 * SS3, CR, LF, SP, DEL, CP932_TABLE_BEGIN and CP932_TABLE_END are defined
 * elsewhere in this file.
 */

/* Nonzero when bits 8-15 of c2 (taken as an unsigned short) equal SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)

/*
 * Nonzero when c is CR or LF, or lies strictly between SP and DEL and is
 * none of: '?', '=', '_', '(', ')', '.', or the double quote (0x22).
 */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

/* Nonzero when c2 lies within [CP932_TABLE_BEGIN, CP932_TABLE_END]. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
/* Nonzero when c lies within [SP, 0x5F]. */
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) <= 0x5F)
304 #define HOLD_SIZE 1024
305 #if defined(INT_IS_SHORT)
306 #define IOBUF_SIZE 2048
308 #define IOBUF_SIZE 16384
311 #define DEFAULT_J 'B'
312 #define DEFAULT_R 'B'
319 /* MIME preprocessor */
321 #ifdef EASYWIN /*Easy Win */
322 extern POINT _BufferSize;
331 void (*status_func)(struct input_code *, nkf_char);
332 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
336 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
337 static nkf_encoding *input_encoding = NULL;
338 static nkf_encoding *output_encoding = NULL;
340 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
342 * 0: Shift_JIS, eucJP-ascii
347 #define UCS_MAP_ASCII 0
349 #define UCS_MAP_CP932 2
350 #define UCS_MAP_CP10001 3
351 static int ms_ucs_map_f = UCS_MAP_ASCII;
353 #ifdef UTF8_INPUT_ENABLE
354 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
355 static int no_cp932ext_f = FALSE;
356 /* ignore ZERO WIDTH NO-BREAK SPACE */
357 static int no_best_fit_chars_f = FALSE;
358 static int input_endian = ENDIAN_BIG;
359 static int input_bom_f = FALSE;
360 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
361 static void (*encode_fallback)(nkf_char c) = NULL;
362 static void w_status(struct input_code *, nkf_char);
364 #ifdef UTF8_OUTPUT_ENABLE
365 static int output_bom_f = FALSE;
366 static int output_endian = ENDIAN_BIG;
369 static void std_putc(nkf_char c);
370 static nkf_char std_getc(FILE *f);
371 static nkf_char std_ungetc(nkf_char c,FILE *f);
373 static nkf_char broken_getc(FILE *f);
374 static nkf_char broken_ungetc(nkf_char c,FILE *f);
376 static nkf_char mime_getc(FILE *f);
378 static void mime_putc(nkf_char c);
382 #if !defined(PERL_XS) && !defined(WIN32DLL)
383 static unsigned char stdibuf[IOBUF_SIZE];
384 static unsigned char stdobuf[IOBUF_SIZE];
387 #define NKF_UNSPECIFIED (-TRUE)
390 static int unbuf_f = FALSE;
391 static int estab_f = FALSE;
392 static int nop_f = FALSE;
393 static int binmode_f = TRUE; /* binary mode */
394 static int rot_f = FALSE; /* rot14/43 mode */
395 static int hira_f = FALSE; /* hira/kata henkan */
396 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
397 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
398 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
399 static int mimebuf_f = FALSE; /* MIME buffered input */
400 static int broken_f = FALSE; /* convert ESC-less broken JIS */
401 static int iso8859_f = FALSE; /* ISO8859 through */
402 static int mimeout_f = FALSE; /* base64 mode */
403 static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */
404 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
406 #ifdef UNICODE_NORMALIZATION
407 static int nfc_f = FALSE;
408 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
409 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
413 static int cap_f = FALSE;
414 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
415 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
417 static int url_f = FALSE;
418 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
419 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
422 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
423 #define CLASS_MASK NKF_INT32_C(0xFF000000)
424 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
425 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
426 #define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
427 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
428 #define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
429 #define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/*
 * Predicates on nkf_char values that may carry a Unicode code point.
 * The parameter is parenthesized before masking so that expression
 * arguments (e.g. `a | b`) are masked as a whole.
 */
/* Nonzero when the CLASS_MASK bits of c equal CLASS_UNICODE. */
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* Nonzero when the VALUE_MASK bits of c do not exceed UNICODE_BMP_MAX. */
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* Nonzero when the VALUE_MASK bits of c do not exceed UNICODE_MAX. */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
434 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
436 #ifdef NUMCHAR_OPTION
437 static int numchar_f = FALSE;
438 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
439 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
443 static int noout_f = FALSE;
444 static void no_putc(nkf_char c);
445 static int debug_f = FALSE;
446 static void debug(const char *str);
447 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
450 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
451 static void set_input_codename(const char *codename);
454 static int exec_f = 0;
457 #ifdef SHIFTJIS_CP932
458 /* invert IBM extended characters to others */
459 static int cp51932_f = FALSE;
461 /* invert NEC-selected IBM extended characters to IBM extended characters */
462 static int cp932inv_f = TRUE;
464 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
465 #endif /* SHIFTJIS_CP932 */
467 static int x0212_f = FALSE;
468 static int x0213_f = FALSE;
470 static unsigned char prefix_table[256];
472 static void e_status(struct input_code *, nkf_char);
473 static void s_status(struct input_code *, nkf_char);
475 struct input_code input_code_list[] = {
476 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
477 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
478 #ifdef UTF8_INPUT_ENABLE
479 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
480 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
481 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
483 {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0}
486 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
487 static int base64_count = 0;
489 /* X0208 -> ASCII converter */
492 static int f_line = 0; /* chars in line */
493 static int f_prev = 0;
494 static int fold_preserve_f = FALSE; /* preserve new lines */
495 static int fold_f = FALSE;
496 static int fold_len = 0;
499 static unsigned char kanji_intro = DEFAULT_J;
500 static unsigned char ascii_intro = DEFAULT_R;
504 #define FOLD_MARGIN 10
505 #define DEFAULT_FOLD 60
507 static int fold_margin = FOLD_MARGIN;
509 /* process default */
512 no_connection2(ARG_UNUSED nkf_char c2, ARG_UNUSED nkf_char c1, ARG_UNUSED nkf_char c0)
514 fprintf(stderr,"nkf internal module connection failure.\n");
520 no_connection(nkf_char c2, nkf_char c1)
522 no_connection2(c2,c1,0);
525 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
526 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
528 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
529 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
530 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
531 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
532 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
533 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
534 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
536 /* static redirections */
538 static void (*o_putc)(nkf_char c) = std_putc;
540 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
541 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
543 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
544 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
546 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
548 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
549 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
551 /* for strict mime */
552 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
553 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
556 static int output_mode = ASCII; /* output kanji mode */
557 static int input_mode = ASCII; /* input kanji mode */
558 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
560 /* X0201 / X0208 conversion tables */
562 /* X0201 kana conversion table */
564 static const unsigned char cv[]= {
565 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
566 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
567 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
568 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
569 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
570 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
571 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
572 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
573 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
574 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
575 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
576 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
577 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
578 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
579 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
580 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
584 /* X0201 kana conversion table for dakuten (voiced sound mark) */
586 static const unsigned char dv[]= {
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
592 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
593 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
594 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
595 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
596 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
598 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 /* X0201 kana conversion table for handakuten (semi-voiced sound mark) */
607 static const unsigned char ev[]= {
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
618 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
619 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
626 /* X0201 kana to X0213 conversion table for handakuten (semi-voiced sound mark) */
628 static const unsigned char ev_x0213[]= {
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
634 0x00,0x00,0x00,0x00,0x25,0x77,0x25,0x78,
635 0x25,0x79,0x25,0x7a,0x25,0x7b,0x00,0x00,
636 0x00,0x00,0x00,0x00,0x25,0x7c,0x00,0x00,
637 0x00,0x00,0x00,0x00,0x25,0x7d,0x00,0x00,
638 0x25,0x7e,0x00,0x00,0x00,0x00,0x00,0x00,
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
648 /* X0208 kigou conversion table */
649 /* 0x8140 - 0x819e */
650 static const unsigned char fv[] = {
652 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
653 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
654 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
656 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
657 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
658 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
660 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
668 static int option_mode = 0;
669 static int file_out_f = FALSE;
671 static int overwrite_f = FALSE;
672 static int preserve_time_f = FALSE;
673 static int backup_f = FALSE;
674 static char *backup_suffix = "";
677 static int eolmode_f = 0; /* CR, LF, CRLF */
678 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
679 static nkf_char prev_cr = 0; /* CR or 0 */
680 #ifdef EASYWIN /*Easy Win */
681 static int end_check;
685 nkf_xmalloc(size_t size)
689 if (size == 0) size = 1;
693 perror("can't malloc");
701 nkf_xrealloc(void *ptr, size_t size)
703 if (size == 0) size = 1;
705 ptr = realloc(ptr, size);
707 perror("can't realloc");
714 #define nkf_xfree(ptr) free(ptr)
717 nkf_str_caseeql(const char *src, const char *target)
720 for (i = 0; src[i] && target[i]; i++) {
721 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
723 if (src[i] || target[i]) return FALSE;
728 nkf_enc_from_index(int idx)
730 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
733 return &nkf_encoding_table[idx];
737 nkf_enc_find_index(const char *name)
740 if (name[0] == 'X' && *(name+1) == '-') name += 2;
741 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
742 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
743 return encoding_name_to_id_table[i].id;
750 nkf_enc_find(const char *name)
753 idx = nkf_enc_find_index(name);
754 if (idx < 0) return 0;
755 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for a pointer to an nkf_encoding entry.
 * Each macro expands to an expression on `enc`.  The predicate macros
 * mention `enc` several times, so the argument is evaluated repeatedly
 * and must not have side effects.
 */
/* The `name` member of the entry. */
758 #define nkf_enc_name(enc) (enc)->name
/* The `id` member of the entry. */
759 #define nkf_enc_to_index(enc) (enc)->id
/* The `base_encoding` member of the entry. */
760 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* The `iconv` / `oconv` members of the entry's base encoding. */
761 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
762 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP. */
763 #define nkf_enc_asciicompat(enc) (\
764 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
765 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is NkfEncodingUTF_8, NkfEncodingUTF_16 or NkfEncodingUTF_32. */
766 #define nkf_enc_unicode_p(enc) (\
767 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
768 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
769 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the entry's id is CP50220, CP50221 or CP50222. */
770 #define nkf_enc_cp5022x_p(enc) (\
771 nkf_enc_to_index(enc) == CP50220 ||\
772 nkf_enc_to_index(enc) == CP50221 ||\
773 nkf_enc_to_index(enc) == CP50222)
775 #ifdef DEFAULT_CODE_LOCALE
777 nkf_locale_charmap(void)
779 #ifdef HAVE_LANGINFO_H
780 return nl_langinfo(CODESET);
781 #elif defined(__WIN32__)
783 sprintf(buf, "CP%d", GetACP());
785 #elif defined(__OS2__)
786 # if defined(INT_IS_SHORT)
792 ULONG ulCP[1], ulncp;
793 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
794 if (ulCP[0] == 932 || ulCP[0] == 943)
795 strcpy(buf, "Shift_JIS");
797 sprintf(buf, "CP%lu", ulCP[0]);
805 nkf_locale_encoding(void)
807 nkf_encoding *enc = 0;
808 const char *encname = nkf_locale_charmap();
810 enc = nkf_enc_find(encname);
813 #endif /* DEFAULT_CODE_LOCALE */
816 nkf_utf8_encoding(void)
818 return &nkf_encoding_table[UTF_8];
822 nkf_default_encoding(void)
824 nkf_encoding *enc = 0;
825 #ifdef DEFAULT_CODE_LOCALE
826 enc = nkf_locale_encoding();
827 #elif defined(DEFAULT_ENCIDX)
828 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
830 if (!enc) enc = nkf_utf8_encoding();
841 nkf_buf_new(int length)
843 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
844 buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length);
852 nkf_buf_dispose(nkf_buf_t *buf)
859 #define nkf_buf_length(buf) ((buf)->len)
860 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
863 nkf_buf_at(nkf_buf_t *buf, int index)
865 assert(index <= buf->len);
866 return buf->ptr[index];
870 nkf_buf_clear(nkf_buf_t *buf)
876 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
878 if (buf->capa <= buf->len) {
881 buf->ptr[buf->len++] = c;
885 nkf_buf_pop(nkf_buf_t *buf)
887 assert(!nkf_buf_empty_p(buf));
888 return buf->ptr[--buf->len];
891 /* Normalization Form C */
894 #define fprintf dllprintf
900 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
907 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
908 #ifdef UTF8_OUTPUT_ENABLE
909 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
910 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
913 #ifdef UTF8_INPUT_ENABLE
914 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
915 " UTF option is -W[8,[16,32][B,L]]\n"
917 " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
921 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
922 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
923 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
926 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
927 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
928 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
929 " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n"
932 " O Output to File (DEFAULT 'nkf.out')\n"
933 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
936 " --ic=<encoding> Specify the input encoding\n"
937 " --oc=<encoding> Specify the output encoding\n"
938 " --hiragana --katakana Hiragana/Katakana Conversion\n"
939 " --katakana-hiragana Converts each other\n"
943 " --{cap, url}-input Convert hex after ':' or '%%'\n"
945 #ifdef NUMCHAR_OPTION
946 " --numchar-input Convert Unicode Character Reference\n"
948 #ifdef UTF8_INPUT_ENABLE
949 " --fb-{skip, html, xml, perl, java, subchar}\n"
950 " Specify unassigned character's replacement\n"
955 " --in-place[=SUF] Overwrite original files\n"
956 " --overwrite[=SUF] Preserve timestamp of original files\n"
958 " -g --guess Guess the input code\n"
959 " -v --version Print the version\n"
960 " --help/-V Print this help / configuration\n"
966 show_configuration(void)
969 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
970 " Compile-time options:\n"
971 " Compiled at: " __DATE__ " " __TIME__ "\n"
974 " Default output encoding: "
975 #ifdef DEFAULT_CODE_LOCALE
976 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
977 #elif defined(DEFAULT_ENCIDX)
978 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
984 " Default output end of line: "
985 #if DEFAULT_NEWLINE == CR
987 #elif DEFAULT_NEWLINE == CRLF
993 " Decode MIME encoded string: "
994 #if MIME_DECODE_DEFAULT
1000 " Convert JIS X 0201 Katakana: "
1007 " --help, --version output: "
1008 #if HELP_OUTPUT_HELP_OUTPUT
1019 get_backup_filename(const char *suffix, const char *filename)
1021 char *backup_filename;
1022 int asterisk_count = 0;
1024 int filename_length = strlen(filename);
1026 for(i = 0; suffix[i]; i++){
1027 if(suffix[i] == '*') asterisk_count++;
1031 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
1032 for(i = 0, j = 0; suffix[i];){
1033 if(suffix[i] == '*'){
1034 backup_filename[j] = '\0';
1035 strncat(backup_filename, filename, filename_length);
1037 j += filename_length;
1039 backup_filename[j++] = suffix[i++];
1042 backup_filename[j] = '\0';
1044 j = filename_length + strlen(suffix);
1045 backup_filename = nkf_xmalloc(j + 1);
1046 strcpy(backup_filename, filename);
1047 strcat(backup_filename, suffix);
1048 backup_filename[j] = '\0';
1050 return backup_filename;
1054 #ifdef UTF8_INPUT_ENABLE
1056 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1061 if(c >= NKF_INT32_C(1)<<shift){
1063 (*f)(0, bin2hex(c>>shift));
1074 encode_fallback_html(nkf_char c)
1079 if(c >= NKF_INT32_C(1000000))
1080 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1081 if(c >= NKF_INT32_C(100000))
1082 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1084 (*oconv)(0, 0x30+(c/10000 )%10);
1086 (*oconv)(0, 0x30+(c/1000 )%10);
1088 (*oconv)(0, 0x30+(c/100 )%10);
1090 (*oconv)(0, 0x30+(c/10 )%10);
1092 (*oconv)(0, 0x30+ c %10);
1098 encode_fallback_xml(nkf_char c)
1103 nkf_each_char_to_hex(oconv, c);
1109 encode_fallback_java(nkf_char c)
1113 if(!nkf_char_unicode_bmp_p(c)){
1117 (*oconv)(0, bin2hex(c>>20));
1118 (*oconv)(0, bin2hex(c>>16));
1122 (*oconv)(0, bin2hex(c>>12));
1123 (*oconv)(0, bin2hex(c>> 8));
1124 (*oconv)(0, bin2hex(c>> 4));
1125 (*oconv)(0, bin2hex(c ));
1130 encode_fallback_perl(nkf_char c)
1135 nkf_each_char_to_hex(oconv, c);
1141 encode_fallback_subchar(nkf_char c)
1143 c = unicode_subchar;
1144 (*oconv)((c>>8)&0xFF, c&0xFF);
1149 static const struct {
1173 {"katakana-hiragana","h3"},
1181 #ifdef UTF8_OUTPUT_ENABLE
1191 {"fb-subchar=", ""},
1193 #ifdef UTF8_INPUT_ENABLE
1194 {"utf8-input", "W"},
1195 {"utf16-input", "W16"},
1196 {"no-cp932ext", ""},
1197 {"no-best-fit-chars",""},
1199 #ifdef UNICODE_NORMALIZATION
1200 {"utf8mac-input", ""},
1212 #ifdef NUMCHAR_OPTION
1213 {"numchar-input", ""},
1219 #ifdef SHIFTJIS_CP932
1230 set_input_encoding(nkf_encoding *enc)
1232 switch (nkf_enc_to_index(enc)) {
1238 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1240 #ifdef SHIFTJIS_CP932
1243 #ifdef UTF8_OUTPUT_ENABLE
1244 ms_ucs_map_f = UCS_MAP_CP932;
1254 case ISO_2022_JP_2004:
1261 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1262 #ifdef SHIFTJIS_CP932
1265 #ifdef UTF8_OUTPUT_ENABLE
1266 ms_ucs_map_f = UCS_MAP_CP932;
1271 #ifdef SHIFTJIS_CP932
1274 #ifdef UTF8_OUTPUT_ENABLE
1275 ms_ucs_map_f = UCS_MAP_CP10001;
1283 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1284 #ifdef SHIFTJIS_CP932
1287 #ifdef UTF8_OUTPUT_ENABLE
1288 ms_ucs_map_f = UCS_MAP_CP932;
1292 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1293 #ifdef SHIFTJIS_CP932
1296 #ifdef UTF8_OUTPUT_ENABLE
1297 ms_ucs_map_f = UCS_MAP_MS;
1301 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1302 #ifdef SHIFTJIS_CP932
1305 #ifdef UTF8_OUTPUT_ENABLE
1306 ms_ucs_map_f = UCS_MAP_ASCII;
1309 case SHIFT_JISX0213:
1310 case SHIFT_JIS_2004:
1312 #ifdef SHIFTJIS_CP932
1314 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1320 #ifdef SHIFTJIS_CP932
1324 #ifdef UTF8_INPUT_ENABLE
1325 #ifdef UNICODE_NORMALIZATION
1333 input_endian = ENDIAN_BIG;
1337 input_endian = ENDIAN_LITTLE;
1342 input_endian = ENDIAN_BIG;
1346 input_endian = ENDIAN_LITTLE;
1353 set_output_encoding(nkf_encoding *enc)
1355 switch (nkf_enc_to_index(enc)) {
1357 #ifdef SHIFTJIS_CP932
1358 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1360 #ifdef UTF8_OUTPUT_ENABLE
1361 ms_ucs_map_f = UCS_MAP_CP932;
1365 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1366 #ifdef SHIFTJIS_CP932
1367 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1369 #ifdef UTF8_OUTPUT_ENABLE
1370 ms_ucs_map_f = UCS_MAP_CP932;
1374 #ifdef SHIFTJIS_CP932
1375 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1380 #ifdef SHIFTJIS_CP932
1381 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1385 case ISO_2022_JP_2004:
1388 #ifdef SHIFTJIS_CP932
1389 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1395 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1396 #ifdef UTF8_OUTPUT_ENABLE
1397 ms_ucs_map_f = UCS_MAP_CP932;
1401 #ifdef UTF8_OUTPUT_ENABLE
1402 ms_ucs_map_f = UCS_MAP_CP10001;
1407 #ifdef SHIFTJIS_CP932
1408 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1410 #ifdef UTF8_OUTPUT_ENABLE
1411 ms_ucs_map_f = UCS_MAP_ASCII;
1416 #ifdef SHIFTJIS_CP932
1417 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1419 #ifdef UTF8_OUTPUT_ENABLE
1420 ms_ucs_map_f = UCS_MAP_ASCII;
1424 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1425 #ifdef SHIFTJIS_CP932
1426 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1428 #ifdef UTF8_OUTPUT_ENABLE
1429 ms_ucs_map_f = UCS_MAP_CP932;
1433 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1435 #ifdef UTF8_OUTPUT_ENABLE
1436 ms_ucs_map_f = UCS_MAP_MS;
1440 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1442 #ifdef UTF8_OUTPUT_ENABLE
1443 ms_ucs_map_f = UCS_MAP_ASCII;
1446 case SHIFT_JISX0213:
1447 case SHIFT_JIS_2004:
1449 #ifdef SHIFTJIS_CP932
1450 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1457 #ifdef SHIFTJIS_CP932
1458 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1461 #ifdef UTF8_OUTPUT_ENABLE
1463 output_bom_f = TRUE;
1467 output_bom_f = TRUE;
1470 output_endian = ENDIAN_LITTLE;
1471 output_bom_f = FALSE;
1474 output_endian = ENDIAN_LITTLE;
1475 output_bom_f = TRUE;
1479 output_bom_f = TRUE;
1482 output_endian = ENDIAN_LITTLE;
1483 output_bom_f = FALSE;
1486 output_endian = ENDIAN_LITTLE;
1487 output_bom_f = TRUE;
/* Look up the input_code_list entry registered for a converter: starting at
 * the head of input_code_list, an entry matches when its iconv_func pointer
 * equals the iconv_func argument. */
1493 static struct input_code*
1494 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1497 struct input_code *p = input_code_list;
1499 if (iconv_func == p->iconv_func){
/*
 * Select the input converter.  f is a flag (TRUE, FALSE, or -TRUE for
 * "FORCE"); under INPUT_CODE_FIX both steps are additionally gated on f or on
 * input_encoding being unset.
 * Once the encoding is established (estab_f) and the active converter iconv
 * differs from iconv_for_check, the matching input code's name is passed to
 * set_input_codename() and iconv_for_check is brought up to date.
 */
1509 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1511 #ifdef INPUT_CODE_FIX
1512 if (f || !input_encoding)
1519 #ifdef INPUT_CODE_FIX
1520 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1526 if (estab_f && iconv_for_check != iconv){
1527 struct input_code *p = find_inputcode_byfunc(iconv);
1529 set_input_codename(p->name);
1532 iconv_for_check = iconv;
/* Relocate a row value c in 0x75..0x7f: one path adds (0x109 - 0x75), the
 * other adds (0x113 - 0x75).
 * NOTE(review): which path applies is decided by surrounding code (presumably
 * plain vs. EUC G3 input) - confirm. */
1539 x0212_shift(nkf_char c)
1544 if (0x75 <= c && c <= 0x7f){
1545 ret = c + (0x109 - 0x75);
1548 if (0x75 <= c && c <= 0x7f){
1549 ret = c + (0x113 - 0x75);
/* Map a relocated row value back to the 0x75.. range: 0x7f..0x88 becomes
 * 0x75..0x7e, and 0x89..0x92 becomes the same range tagged with
 * PREFIX_EUCG3 | 0x80. */
1557 x0212_unshift(nkf_char c)
1560 if (0x7f <= c && c <= 0x88){
1561 ret = c + (0x75 - 0x7f);
1562 }else if (0x89 <= c && c <= 0x92){
1563 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1567 #endif /* X0212_ENABLE */
/* Row (ku) membership test: for small ku the answer comes from
 * x0213_2_table (rows 1, 3-5, 8 and 12-15 are set); rows 78..94 are tested
 * separately.  NOTE(review): ku is presumably derived from c1 - confirm. */
1570 is_x0213_2_in_x0212(nkf_char c1)
1572 static const char x0213_2_table[] =
1573 {0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1};
1576 return x0213_2_table[ku]; /* 1, 3-5, 8, 12-15 */
1577 if (78 <= ku && ku <= 94)
/*
 * Convert an EUC-style pair (c2, c1) to Shift_JIS lead/trail bytes, written
 * through p2 and p1 when those pointers are non-NULL.
 *  - With x0213_f and a row accepted by is_x0213_2_in_x0212(), rows
 *    0x21..0x2F and 0x6E..0x7E use their own lead-byte formulas.
 *  - Otherwise, for a printable row, the x0212_shiftjis table is consulted.
 *  - c2 may be relocated with x0212_shift() before the generic formula.
 * Returns 1 when c2 is still above 0x7F at the generic step.
 */
1583 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1588 if (x0213_f && is_x0213_2_in_x0212(ndx)){
1589 if((0x21 <= ndx && ndx <= 0x2F)){
1590 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1591 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1593 }else if(0x6E <= ndx && ndx <= 0x7E){
1594 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1595 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1601 else if(nkf_isgraph(ndx)){
1603 const unsigned short *ptr;
1604 ptr = x0212_shiftjis[ndx - 0x21];
1606 val = ptr[(c1 & 0x7f) - 0x21];
1615 c2 = x0212_shift(c2);
1617 #endif /* X0212_ENABLE */
1619 if(0x7F < c2) return 1;
/* Generic JIS -> Shift_JIS arithmetic: two rows share one lead byte, and the
 * row's parity selects which half of the trail-byte range is used. */
1620 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1621 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * Convert Shift_JIS bytes (c2 lead, c1 trail) to an EUC-style pair.
 * Returns 1 immediately when the trail byte exceeds 0xFC.
 * Under SHIFTJIS_CP932 the shiftjis_cp932 and cp932inv tables may remap the
 * input first; IBM extension leads may be mapped through shiftjis_x0212 into
 * the PREFIX_EUCG3 plane.  With x0213_f, leads >= 0xF0 are mapped to
 * PREFIX_EUCG3 rows; everything else uses the SJ0162 / SJ6394 arithmetic.
 */
1626 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1628 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1631 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1632 if (0xFC < c1) return 1;
1633 #ifdef SHIFTJIS_CP932
1634 if (!cp932inv_f && !x0213_f && is_ibmext_in_sjis(c2)){
1635 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1642 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1643 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1649 #endif /* SHIFTJIS_CP932 */
1651 if (!x0213_f && is_ibmext_in_sjis(c2)){
1652 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1655 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1668 if(x0213_f && c2 >= 0xF0){
1669 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
/* The table column is chosen by whether the trail byte is in the upper half
 * (0x9E < c1). */
1670 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1671 }else{ /* 78<=k<=94 */
1672 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1673 if (0x9E < c1) c2++;
1676 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1677 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1678 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1679 if (0x9E < c1) c2++;
1682 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1689 c2 = x0212_unshift(c2);
1696 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * Encode the code point val as UTF-8, one byte per output pointer
 * (*p1 .. *p4).  Values below 0x800 use the two-byte form, BMP values the
 * three-byte form, and other values accepted by nkf_char_unicode_value_p()
 * the four-byte form.
 */
1698 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1706 }else if (val < 0x800){
1707 *p1 = 0xc0 | (val >> 6);
1708 *p2 = 0x80 | (val & 0x3f);
1711 } else if (nkf_char_unicode_bmp_p(val)) {
1712 *p1 = 0xe0 | (val >> 12);
1713 *p2 = 0x80 | ((val >> 6) & 0x3f);
1714 *p3 = 0x80 | ( val & 0x3f);
1716 } else if (nkf_char_unicode_value_p(val)) {
1717 *p1 = 0xf0 | (val >> 18);
1718 *p2 = 0x80 | ((val >> 12) & 0x3f);
1719 *p3 = 0x80 | ((val >> 6) & 0x3f);
1720 *p4 = 0x80 | ( val & 0x3f);
/*
 * Decode one UTF-8 sequence, passed as up to four separate bytes c1..c4,
 * into a code point accumulated in wc.  The form is chosen from the lead
 * byte c1 alone: up to 0xC1 is a trail byte or invalid lead, up to 0xDF a
 * two-byte lead, up to 0xEF a three-byte lead, up to 0xF4 a four-byte lead.
 */
1730 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
1737 else if (c1 <= 0xC1) {
1738 /* trail byte or invalid */
1741 else if (c1 <= 0xDF) {
1743 wc = (c1 & 0x1F) << 6;
1746 else if (c1 <= 0xEF) {
1748 wc = (c1 & 0x0F) << 12;
1749 wc |= (c2 & 0x3F) << 6;
/* Four-byte leads are 0xF0..0xF4.  The bound must be applied to the lead
 * byte c1 like every other branch; testing the second byte c2 here accepted
 * lead bytes 0xF5..0xFF because a trail byte is always <= 0xF4. */
1752 else if (c1 <= 0xF4) {
1754 wc = (c1 & 0x0F) << 18;
1755 wc |= (c2 & 0x3F) << 12;
1756 wc |= (c3 & 0x3F) << 6;
1766 #ifdef UTF8_INPUT_ENABLE
/*
 * Table lookup shared by the UTF-8 -> JIS converters.  pp is a table of
 * psize row pointers.  The function returns 1 (no mapping) when pp is NULL,
 * c1 is outside [0, psize), the selected row p is NULL, c0 is outside
 * [0, sizeof_utf8_to_euc_C2), or the table value is 0.
 * With no_cp932ext_f set, values in row 0x2D (NEC special characters) or
 * above 0xF300 (IBM extended characters) are tested for.
 * NOTE(review): presumably those are rejected as well - confirm.
 * A resulting c2 equal to SO is reported as JIS_X_0201_1976_K.
 */
1768 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1769 const unsigned short *const *pp, nkf_char psize,
1770 nkf_char *p2, nkf_char *p1)
1773 const unsigned short *p;
1776 if (pp == 0) return 1;
1779 if (c1 < 0 || psize <= c1) return 1;
1781 if (p == 0) return 1;
1784 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1786 if (val == 0) return 1;
1787 if (no_cp932ext_f && (
1788 (val>>8) == 0x2D || /* NEC special characters */
1789 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1797 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * Convert a UTF-8 sequence of up to three bytes (c2 lead, then c1, c0) to a
 * JIS/EUC-style pair stored through p2 and p1.
 *  - Leads below 0xE0 use a two-byte table chosen from ms_ucs_map_f and
 *    x0213_f.
 *  - Leads 0xE0..0xEF use the three-byte table row ppp[c2 - 0xE0].
 * When no_best_fit_chars_f is set, characters listed in the
 * no_best_fit_chars_table_* arrays (indexed by c1 & 0x3F) or matched by the
 * explicit byte tests make the function return 1 instead of converting.
 * Under SHIFTJIS_CP932 a successful EUC G3 result is round-tripped through
 * e2s_conv()/s2e_conv() when cp932inv_f is off.
 */
1805 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1807 const unsigned short *const *pp;
1808 const unsigned short *const *const *ppp;
1809 static const char no_best_fit_chars_table_C2[] =
1810 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1811 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1812 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1813 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1814 static const char no_best_fit_chars_table_C2_ms[] =
1815 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1816 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1817 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1818 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1819 static const char no_best_fit_chars_table_932_C2[] =
1820 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1821 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1822 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1823 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1824 static const char no_best_fit_chars_table_932_C3[] =
1825 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1826 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1827 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1828 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1834 }else if(c2 < 0xe0){
1835 if(no_best_fit_chars_f){
1836 if(ms_ucs_map_f == UCS_MAP_CP932){
1839 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1842 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1845 }else if(!cp932inv_f){
1848 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1851 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1854 }else if(ms_ucs_map_f == UCS_MAP_MS){
1855 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1856 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1874 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1875 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1876 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1877 x0213_f ? utf8_to_euc_2bytes_x0213 :
1879 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* The three-byte branch indexes ppp[c2 - 0xE0], so the bound has to be on
 * the lead byte c2.  Testing the third byte c0 (a trail byte, always below
 * 0xF0) left that index unguarded for leads >= 0xF0. */
1880 }else if(c2 < 0xF0){
1881 if(no_best_fit_chars_f){
1882 if(ms_ucs_map_f == UCS_MAP_CP932){
1883 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1884 }else if(ms_ucs_map_f == UCS_MAP_MS){
1889 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1892 if(c0 == 0x92) return 1;
1897 if(c1 == 0x80 || c0 == 0x9C) return 1;
1900 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1905 if(c0 == 0x94) return 1;
1908 if(c0 == 0xBB) return 1;
1918 if(c0 == 0x95) return 1;
1921 if(c0 == 0xA5) return 1;
1928 if(c0 == 0x8D) return 1;
1931 if(c0 == 0x9E && !cp932inv_f) return 1;
1934 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1942 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1943 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1944 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1945 x0213_f ? utf8_to_euc_3bytes_x0213 :
1947 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1949 #ifdef SHIFTJIS_CP932
1950 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1952 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1953 s2e_conv(s2, s1, p2, p1);
1962 #ifdef UTF8_OUTPUT_ENABLE
/* Linear search of a surrogate table: scans tbl[0..size) for the row whose
 * first column equals euc.  Used by e2w_conv() with the x0213 plane 1 and
 * plane 2 surrogate tables. */
1963 #define X0213_SURROGATE_FIND(tbl, size, euc) do { \
1965 for (i = 0; i < size; i++) \
1966 if (tbl[i][0] == euc) { \
1973 e2w_conv(nkf_char c2, nkf_char c1)
/*
 * Map an internal EUC-style code (c2, c1) to a Unicode value.
 * A table p is selected from c2 (JIS X 0201 kana, the EUC G3 plane, or the
 * default plane) together with ms_ucs_map_f and x0213_f, and is then indexed
 * by the cell (c1 & 0x7f) - 0x21.  With x0213_f, an entry that is a high
 * surrogate (0xD800..0xDBFF) has its low half looked up in the x0213
 * surrogate tables and the pair is merged with UTF16_TO_UTF32().
 */
1975 const unsigned short *p;
1977 if (c2 == JIS_X_0201_1976_K) {
1978 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1986 p = euc_to_utf8_1byte;
1988 } else if (is_eucg3(c2)){
1989 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1992 c2 = (c2&0x7f) - 0x21;
1993 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1995 x0213_f ? x0212_to_utf8_2bytes_x0213[c2] :
1996 x0212_to_utf8_2bytes[c2];
2002 c2 = (c2&0x7f) - 0x21;
2003 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2005 x0213_f ? euc_to_utf8_2bytes_x0213[c2] :
2006 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
2007 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
2008 euc_to_utf8_2bytes_ms[c2];
2013 c1 = (c1 & 0x7f) - 0x21;
2014 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte) {
2015 nkf_char val = p[c1];
2016 if (x0213_f && 0xD800<=val && val<=0xDBFF) {
/* c2 and c1 are zero-based indices here, so the row/cell code is rebuilt by
 * adding 0x21 back to each. */
2017 nkf_char euc = (c2+0x21)<<8 | (c1+0x21);
/* The selected table tells the two planes apart: the x0212 (plane 2) table
 * uses the plane 2 surrogate list, anything else the plane 1 list. */
2019 if (p==x0212_to_utf8_2bytes_x0213[c2]) {
2020 X0213_SURROGATE_FIND(x0213_2_surrogate_table, sizeof_x0213_2_surrogate_table, euc);
2022 X0213_SURROGATE_FIND(x0213_1_surrogate_table, sizeof_x0213_1_surrogate_table, euc);
2025 return UTF16_TO_UTF32(val, low);
/*
 * Combining-sequence lookup for an EUC-style code (c2, c1).
 * comb is first searched for in x0213_combining_chars; a miss is detected by
 * the index reaching sizeof_x0213_combining_chars.  Then the row of
 * x0213_combining_table whose column 0 equals the 7-bit code
 * (c2&0x7f)<<8 | (c1&0x7f) is located and its column 1 is returned.
 */
2034 e2w_combining(nkf_char comb, nkf_char c2, nkf_char c1)
2038 for (i = 0; i < sizeof_x0213_combining_chars; i++)
2039 if (x0213_combining_chars[i] == comb)
2041 if (i >= sizeof_x0213_combining_chars)
2043 euc = (c2&0x7f)<<8 | (c1&0x7f);
2044 for (i = 0; i < sizeof_x0213_combining_table; i++)
2045 if (x0213_combining_table[i][0] == euc)
2046 return x0213_combining_table[i][1];
/*
 * UTF-8 (bytes c2, c1, c0) to EUC-style pair via *p2 / *p1.
 * Leads 0xc0..0xef are handed to unicode_to_jis_common().  Under
 * NUMCHAR_OPTION the decoded code point can instead be stored in *p1 as a
 * tagged Unicode value (nkf_char_unicode_new).
 * NOTE(review): presumably only when the table conversion fails - confirm.
 */
2052 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
2059 }else if (0xc0 <= c2 && c2 <= 0xef) {
2060 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2061 #ifdef NUMCHAR_OPTION
2064 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
2072 #ifdef UTF8_INPUT_ENABLE
/*
 * Convert a Unicode code point val to an EUC-style pair (*p2, *p1).
 * BMP values are re-encoded as UTF-8 and passed to unicode_to_jis_common().
 * Other values are split into a UTF-16 surrogate pair and searched for in
 * the x0213 plane 1 and plane 2 surrogate tables; a plane 2 hit is tagged
 * with PREFIX_EUCG3.  Values left unconverted are stored in *p1 as tagged
 * Unicode (nkf_char_unicode_new).
 */
2074 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
2076 nkf_char c1, c2, c3, c4;
2083 else if (nkf_char_unicode_bmp_p(val)){
2084 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2085 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
2088 *p1 = nkf_char_unicode_new(val);
/* 0xD7C0 is 0xD800 - (0x10000 >> 10): the supplementary-plane offset is
 * folded into the high-surrogate base. */
2095 c1 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2096 c2 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2097 for (i = 0; i < sizeof_x0213_1_surrogate_table; i++)
2098 if (x0213_1_surrogate_table[i][1] == c1 && x0213_1_surrogate_table[i][2] == c2) {
2099 val = x0213_1_surrogate_table[i][0];
2104 for (i = 0; i < sizeof_x0213_2_surrogate_table; i++)
2105 if (x0213_2_surrogate_table[i][1] == c1 && x0213_2_surrogate_table[i][2] == c2) {
2106 val = x0213_2_surrogate_table[i][0];
2107 *p2 = PREFIX_EUCG3 | (val >> 8);
2113 *p1 = nkf_char_unicode_new(val);
/*
 * EUC-JP input converter.
 *  - JIS X 0201 kana / SS2: replaced by GETA1/GETA2 when iso2022jp_f is set
 *    without x0201_f, otherwise tagged JIS_X_0201_1976_K.
 *  - 0x8f prefix (three-byte form): rows 0xF5..0xFE map to the Unicode
 *    private-use area starting at 0xE3AC unless cp51932_f or x0213_f is set;
 *    otherwise c2 becomes (c2 << 8) | (c1 & 0x7f).
 *  - Two-byte form: with ms_ucs_map_f and without cp51932_f, rows 0xF5..0xFE
 *    map to the private-use area starting at 0xE000.
 * Under SHIFTJIS_CP932 some codes are round-tripped through
 * e2s_conv()/s2e_conv().
 */
2120 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2122 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
2123 if (iso2022jp_f && !x0201_f) {
2124 c2 = GETA1; c1 = GETA2;
2126 c2 = JIS_X_0201_1976_K;
2130 }else if (c2 == 0x8f){
2134 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2135 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2136 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2139 c2 = (c2 << 8) | (c1 & 0x7f);
2141 #ifdef SHIFTJIS_CP932
2144 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2145 s2e_conv(s2, s1, &c2, &c1);
2152 #endif /* SHIFTJIS_CP932 */
2154 #endif /* X0212_ENABLE */
2155 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2158 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2159 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2160 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2165 #ifdef SHIFTJIS_CP932
2166 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2168 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2169 s2e_conv(s2, s1, &c2, &c1);
2176 #endif /* SHIFTJIS_CP932 */
/*
 * Shift_JIS input converter.
 *  - Kana (JIS_X_0201_1976_K tag or lead 0xA1..0xDF) becomes GETA1/GETA2
 *    when iso2022jp_f is set without x0201_f.
 *  - Without x0213_f, leads 0xF0..0xF9 with trails 0x40..0xFC map to the
 *    Unicode private-use area from 0xE000, 188 cells per lead; trail 0x7F is
 *    rejected by returning 0.
 *  - Everything else goes through s2e_conv(), whose non-zero result is
 *    returned as is.
 * NOTE(review): c2 is marked ARG_UNUSED but is read throughout the body.
 */
2184 s_iconv(ARG_UNUSED nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2186 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2187 if (iso2022jp_f && !x0201_f) {
2188 c2 = GETA1; c1 = GETA2;
2192 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2194 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2196 if(c1 == 0x7F) return 0;
/* (0x7E < c1) subtracts one for trails above the 0x7F gap so the 188 cells
 * of a lead byte are numbered contiguously. */
2197 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2200 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2201 if (ret) return ret;
/* Predicate: scans x0213_combining_table for a row whose column 1 equals
 * wc, i.e. wc can be the first character of a combining sequence in that
 * table. */
2208 x0213_wait_combining_p(nkf_char wc)
2211 for (i = 0; i < sizeof_x0213_combining_table; i++) {
2212 if (x0213_combining_table[i][1] == wc) {
/* Predicate: scans x0213_combining_chars for an entry equal to wc. */
2220 x0213_combining_p(nkf_char wc)
2223 for (i = 0; i < sizeof_x0213_combining_chars; i++) {
2224 if (x0213_combining_chars[i] == wc) {
/*
 * UTF-8 input converter (c1 is the lead byte, c2/c3 the following bytes;
 * c4 is a local that starts at 0).
 * Lead bytes 0xC0..0xFF are classified through w_iconv_utf8_1st_byte, and
 * each class validates the allowed range of the second byte plus the
 * 10xxxxxx pattern of later bytes, returning 0 on a malformed sequence.
 * A zero c3 makes the three-byte classes return -1 and the four-byte classes
 * return -2.  NOTE(review): presumably a request for more input - confirm
 * against h_conv().
 * Valid four-byte sequences are passed on as tagged Unicode values; other
 * sequences go through w2e_conv(), after an x0213 check for characters that
 * may start a combining sequence.
 */
2232 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2234 nkf_char ret = 0, c4 = 0;
2235 static const char w_iconv_utf8_1st_byte[] =
2237 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2238 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2239 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2240 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2247 if (c1 < 0 || 0xff < c1) {
2248 }else if (c1 == 0) { /* 0 : 1 byte*/
2250 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2253 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2255 if (c2 < 0x80 || 0xBF < c2) return 0;
2258 if (c3 == 0) return -1;
2259 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2264 if (c3 == 0) return -1;
2265 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2269 if (c3 == 0) return -1;
2270 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2274 if (c3 == 0) return -2;
2275 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2279 if (c3 == 0) return -2;
2280 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2284 if (c3 == 0) return -2;
2285 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2293 if (c1 == 0 || c1 == EOF){
2294 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2295 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2298 if (x0213_f && x0213_wait_combining_p(nkf_utf8_to_unicode(c1, c2, c3, c4)))
2300 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/* Variant of w_iconv() that skips the combining-character wait and converts
 * the UTF-8 bytes c1, c2, c3 with w2e_conv() straight away. */
2309 w_iconv_nocombine(nkf_char c1, nkf_char c2, nkf_char c3)
2311 /* continue from the line below 'return -3;' in w_iconv() */
2312 nkf_char ret = w2e_conv(c1, c2, c3, &c1, &c2);
/* Negative status codes returned by unicode_iconv(), unicode_iconv_combine()
 * and the UTF-16/UTF-32 wrappers built on them.  The replacement lists are
 * parenthesized so each macro expands as a single operand in any expression. */
2319 #define NKF_ICONV_INVALID_CODE_RANGE (-13)
2320 #define NKF_ICONV_WAIT_COMBINING_CHAR (-14)
2321 #define NKF_ICONV_NOT_COMBINED (-15)
/*
 * Convert one Unicode code point wc to the internal (c2, c1) form.
 *  - Surrogate code points ((wc>>11) == 27, i.e. 0xD800..0xDFFF) yield
 *    NKF_ICONV_INVALID_CODE_RANGE.
 *  - Values below 0xFFFF go through w16e_conv(); unless nocombine is set,
 *    an x0213 character that may start a combining sequence yields
 *    NKF_ICONV_WAIT_COMBINING_CHAR instead.
 *  - Values below 0x10FFFF are kept as tagged Unicode in c1.
 *  - Anything else yields NKF_ICONV_INVALID_CODE_RANGE.
 * NOTE(review): both bounds use '<', so U+FFFF takes the non-BMP path and
 * U+10FFFF is reported as out of range - confirm this is intended.
 */
2323 unicode_iconv(nkf_char wc, int nocombine)
2331 }else if ((wc>>11) == 27) {
2332 /* unpaired surrogate */
2333 return NKF_ICONV_INVALID_CODE_RANGE;
2334 }else if (wc < 0xFFFF) {
2335 if (!nocombine && x0213_f && x0213_wait_combining_p(wc))
2336 return NKF_ICONV_WAIT_COMBINING_CHAR;
2337 ret = w16e_conv(wc, &c2, &c1);
2338 if (ret) return ret;
2339 }else if (wc < 0x10FFFF) {
2341 c1 = nkf_char_unicode_new(wc);
2343 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * Try to merge a base character wc and a following character wc2 into one
 * x0213 code.  wc2 must pass x0213_combining_p(); the pair is then looked up
 * in x0213_combining_table (column 1 = wc, column 2 = wc2) and column 0
 * supplies the result as c2 = high byte, c1 = low 7 bits.
 * Returns NKF_ICONV_NOT_COMBINED when no combination applies and
 * NKF_ICONV_INVALID_CODE_RANGE for a surrogate or out-of-range wc2.
 */
2350 unicode_iconv_combine(nkf_char wc, nkf_char wc2)
2356 return NKF_ICONV_NOT_COMBINED;
2357 }else if ((wc2>>11) == 27) {
2358 /* unpaired surrogate */
2359 return NKF_ICONV_INVALID_CODE_RANGE;
2360 }else if (wc2 < 0xFFFF) {
2361 if (!x0213_combining_p(wc2))
2362 return NKF_ICONV_NOT_COMBINED;
2363 for (i = 0; i < sizeof_x0213_combining_table; i++) {
2364 if (x0213_combining_table[i][1] == wc &&
2365 x0213_combining_table[i][2] == wc2) {
2366 c2 = x0213_combining_table[i][0] >> 8;
2367 c1 = x0213_combining_table[i][0] & 0x7f;
2372 }else if (wc2 < 0x10FFFF) {
2373 return NKF_ICONV_NOT_COMBINED;
2375 return NKF_ICONV_INVALID_CODE_RANGE;
2377 return NKF_ICONV_NOT_COMBINED;
/* UTF-8 front end for unicode_iconv_combine(): decodes the three-byte
 * sequences (c1, c2, c3) and (c4, c5, c6) into wc and wc2 and returns the
 * result of combining them. */
2381 w_iconv_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6)
2384 wc = nkf_utf8_to_unicode(c1, c2, c3, 0);
2385 wc2 = nkf_utf8_to_unicode(c4, c5, c6, 0);
2388 return unicode_iconv_combine(wc, wc2);
/* size_t sentinels used by the UTF-16/UTF-32 converters to ask for more
 * input.  The cast expression is wrapped in parentheses so the macro stays a
 * single operand wherever it is expanded (e.g. next to sizeof or a postfix
 * operator). */
2391 #define NKF_ICONV_NEED_ONE_MORE_BYTE ((size_t)-1)
2392 #define NKF_ICONV_NEED_TWO_MORE_BYTES ((size_t)-2)
/*
 * UTF-16 input converter over four bytes c1..c4, honouring input_endian.
 * When the first unit is a high surrogate (high byte 0xD8..0xDB) the second
 * unit must be a low surrogate (high byte 0xDC..0xDF); the pair is merged
 * with UTF16_TO_UTF32(), otherwise NKF_ICONV_NEED_TWO_MORE_BYTES is
 * returned.  The code point is then converted through unicode_iconv with
 * combining enabled (nocombine = FALSE).
 */
2394 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2403 if (input_endian == ENDIAN_BIG) {
2404 if (0xD8 <= c1 && c1 <= 0xDB) {
2405 if (0xDC <= c3 && c3 <= 0xDF) {
2406 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2407 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2412 if (0xD8 <= c2 && c2 <= 0xDB) {
2413 if (0xDC <= c4 && c4 <= 0xDF) {
2414 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2415 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2421 return (*unicode_iconv)(wc, FALSE);
/* UTF-16 front end for unicode_iconv_combine(): a high-surrogate byte
 * (0xD8..0xDB) found by the visible checks ends the attempt with
 * NKF_ICONV_NOT_COMBINED; otherwise the two decoded units wc and wc2 are
 * combined.  Byte order follows input_endian. */
2425 nkf_iconv_utf_16_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2429 if (input_endian == ENDIAN_BIG) {
2430 if (0xD8 <= c3 && c3 <= 0xDB) {
2431 return NKF_ICONV_NOT_COMBINED;
2437 if (0xD8 <= c2 && c2 <= 0xDB) {
2438 return NKF_ICONV_NOT_COMBINED;
2445 return unicode_iconv_combine(wc, wc2);
/* Convert a single UTF-16 unit (bytes c1, c2 in input_endian order) through
 * unicode_iconv with the combining wait disabled (nocombine = TRUE). */
2449 nkf_iconv_utf_16_nocombine(nkf_char c1, nkf_char c2)
2452 if (input_endian == ENDIAN_BIG)
2456 return (*unicode_iconv)(wc, TRUE);
/* Converter entry for UTF-16 input; its return value 16 is what sets it
 * apart from w_iconv32(), which returns 32. */
2460 w_iconv16(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2463 return 16; /* different from w_iconv32 */
/* Converter entry for UTF-32 input; its return value 32 is what sets it
 * apart from w_iconv16(), which returns 16. */
2467 w_iconv32(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2470 return 32; /* different from w_iconv16 */
/* Assemble a code point from four UTF-32 bytes according to input_endian.
 * Each visible case takes three of the four bytes in a different order, so
 * four byte orders are handled; an input_endian value with no case yields
 * NKF_ICONV_INVALID_CODE_RANGE. */
2474 utf32_to_nkf_char(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2478 switch(input_endian){
2480 wc = c2 << 16 | c3 << 8 | c4;
2483 wc = c3 << 16 | c2 << 8 | c1;
2486 wc = c1 << 16 | c4 << 8 | c3;
2489 wc = c4 << 16 | c1 << 8 | c2;
2492 return NKF_ICONV_INVALID_CODE_RANGE;
/* UTF-32 input converter: decodes four bytes with utf32_to_nkf_char() and
 * converts the code point through unicode_iconv with combining enabled. */
2498 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2507 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2511 return (*unicode_iconv)(wc, FALSE);
/* UTF-32 front end for unicode_iconv_combine(): decodes bytes c1..c4 into wc
 * and c5..c8 into wc2, then returns the result of combining them. */
2515 nkf_iconv_utf_32_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6, nkf_char c7, nkf_char c8)
2519 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2522 wc2 = utf32_to_nkf_char(c5, c6, c7, c8);
2526 return unicode_iconv_combine(wc, wc2);
/* Decode one UTF-32 unit and convert it through unicode_iconv with the
 * combining wait disabled (nocombine = TRUE). */
2530 nkf_iconv_utf_32_nocombine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2534 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2535 return (*unicode_iconv)(wc, TRUE);
/* When output_mode is neither ASCII nor ISO_8859_1, emit an escape sequence
 * ending in ascii_intro through o_putc and record the new output_mode. */
2539 #define output_ascii_escape_sequence(mode) do { \
2540 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2543 (*o_putc)(ascii_intro); \
2544 output_mode = mode; \
2549 output_escape_sequence(int mode)
/* Emit the escape sequence that switches the output to the given mode.
 * The first check compares output_mode with mode.
 * NOTE(review): presumably returns early when they are already equal -
 * confirm. */
2551 if (output_mode == mode)
2559 case JIS_X_0201_1976_K:
2567 (*o_putc)(kanji_intro);
/*
 * ISO-2022-JP (JIS) output converter for the internal pair (c2, c1).
 * Under NUMCHAR_OPTION a tagged Unicode value in c1 is first tried through
 * w16e_conv(); if it stays Unicode, private-use values 0xE000..0xE757 are
 * mapped to rows from 0x7F upward (94 cells per row) when ms_ucs_map_f is
 * set, and encode_fallback, if installed, is invoked for other values.
 * The remaining branches pick the escape sequence from c2: ASCII (also on
 * EOF), ISO_8859_1, JIS_X_0201_1976_K, the G3 plane (JIS X 0213 plane 2 or
 * JIS X 0212), or the main plane (JIS X 0213 plane 1 or JIS X 0208) after a
 * range check on both bytes.
 */
2592 j_oconv(nkf_char c2, nkf_char c1)
2594 #ifdef NUMCHAR_OPTION
2595 if (c2 == 0 && nkf_char_unicode_p(c1)){
2596 w16e_conv(c1, &c2, &c1);
2597 if (c2 == 0 && nkf_char_unicode_p(c1)){
2598 c2 = c1 & VALUE_MASK;
2599 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2602 c2 = 0x7F + c1 / 94;
2603 c1 = 0x21 + c1 % 94;
2605 if (encode_fallback) (*encode_fallback)(c1);
2612 output_ascii_escape_sequence(ASCII);
2615 else if (c2 == EOF) {
2616 output_ascii_escape_sequence(ASCII);
2619 else if (c2 == ISO_8859_1) {
2620 output_ascii_escape_sequence(ISO_8859_1);
2623 else if (c2 == JIS_X_0201_1976_K) {
2624 output_escape_sequence(JIS_X_0201_1976_K);
2627 } else if (is_eucg3(c2)){
2628 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2629 (*o_putc)(c2 & 0x7f);
2634 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2635 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2636 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
/*
 * EUC-JP output converter for the internal pair (c2, c1).
 * A tagged Unicode value in c1 is first tried through w16e_conv(); if it
 * stays Unicode, private-use values 0xE000..0xE757 are mapped to user rows
 * when x0212_f is set, and encode_fallback, if installed, is invoked for
 * other values.
 * Afterwards c2 selects the form: ASCII (c2 == 0), SS2 + kana for
 * JIS_X_0201_1976_K, a single high-bit byte for ISO_8859_1, the G3 plane
 * for is_eucg3(c2), and otherwise two bytes with the high bit set.  A pair
 * that is not printable in both bytes resets the converter with
 * set_iconv(FALSE, 0) and is dropped.
 */
2643 e_oconv(nkf_char c2, nkf_char c1)
2645 if (c2 == 0 && nkf_char_unicode_p(c1)){
2646 w16e_conv(c1, &c2, &c1);
2647 if (c2 == 0 && nkf_char_unicode_p(c1)){
2648 c2 = c1 & VALUE_MASK;
2649 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2653 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2654 c1 = 0x21 + c1 % 94;
2657 (*o_putc)((c2 & 0x7f) | 0x080);
2658 (*o_putc)(c1 | 0x080);
2660 (*o_putc)((c2 & 0x7f) | 0x080);
2661 (*o_putc)(c1 | 0x080);
2665 if (encode_fallback) (*encode_fallback)(c1);
2673 } else if (c2 == 0) {
2674 output_mode = ASCII;
2676 } else if (c2 == JIS_X_0201_1976_K) {
2677 output_mode = EUC_JP;
2678 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2679 } else if (c2 == ISO_8859_1) {
2680 output_mode = ISO_8859_1;
2681 (*o_putc)(c1 | 0x080);
2683 } else if (is_eucg3(c2)){
2684 output_mode = EUC_JP;
2685 #ifdef SHIFTJIS_CP932
2688 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2689 s2e_conv(s2, s1, &c2, &c1);
2694 output_mode = ASCII;
2696 }else if (is_eucg3(c2)){
2699 (*o_putc)((c2 & 0x7f) | 0x080);
2700 (*o_putc)(c1 | 0x080);
2703 (*o_putc)((c2 & 0x7f) | 0x080);
2704 (*o_putc)(c1 | 0x080);
2708 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2709 set_iconv(FALSE, 0);
2710 return; /* too late to rescue this char */
2712 output_mode = EUC_JP;
2713 (*o_putc)(c2 | 0x080);
2714 (*o_putc)(c1 | 0x080);
/*
 * Shift_JIS output converter for the internal pair (c2, c1).
 * Under NUMCHAR_OPTION a tagged Unicode value in c1 is first tried through
 * w16e_conv(); if it stays Unicode, private-use values 0xE000..0xE757 are
 * mapped to user-defined lead bytes (188 cells per lead, base 0xF0 with
 * cp932inv_f and 0xEB otherwise) unless x0213_f is set, and encode_fallback,
 * if installed, is invoked for other values.
 * Afterwards c2 selects the form: ASCII, kana, ISO_8859_1, the G3 plane via
 * e2s_conv(), or the main plane via e2s_conv() after a printable check (a
 * failing pair resets the converter with set_iconv(FALSE, 0) and is
 * dropped).  Under SHIFTJIS_CP932 the cp932inv table may remap the result,
 * and a prefix_table entry for the trail byte is emitted before it.
 */
2719 s_oconv(nkf_char c2, nkf_char c1)
2721 #ifdef NUMCHAR_OPTION
2722 if (c2 == 0 && nkf_char_unicode_p(c1)){
2723 w16e_conv(c1, &c2, &c1);
2724 if (c2 == 0 && nkf_char_unicode_p(c1)){
2725 c2 = c1 & VALUE_MASK;
2726 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2729 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
/* (c1 > 0x3e) skips the 0x7F gap in the Shift_JIS trail-byte range. */
2731 c1 += 0x40 + (c1 > 0x3e);
2736 if(encode_fallback)(*encode_fallback)(c1);
2745 } else if (c2 == 0) {
2746 output_mode = ASCII;
2748 } else if (c2 == JIS_X_0201_1976_K) {
2749 output_mode = SHIFT_JIS;
2751 } else if (c2 == ISO_8859_1) {
2752 output_mode = ISO_8859_1;
2753 (*o_putc)(c1 | 0x080);
2755 } else if (is_eucg3(c2)){
2756 output_mode = SHIFT_JIS;
2757 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2763 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2764 set_iconv(FALSE, 0);
2765 return; /* too late to rescue this char */
2767 output_mode = SHIFT_JIS;
2768 e2s_conv(c2, c1, &c2, &c1);
2770 #ifdef SHIFTJIS_CP932
2772 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2773 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2779 #endif /* SHIFTJIS_CP932 */
2782 if (prefix_table[(unsigned char)c1]){
2783 (*o_putc)(prefix_table[(unsigned char)c1]);
2789 #ifdef UTF8_OUTPUT_ENABLE
/* Encode val with nkf_unicode_to_utf8() into the caller's c1..c4 and write
 * the bytes through o_putc; c2, c3 and c4 are written only when non-zero. */
2790 #define OUTPUT_UTF8(val) do { \
2791 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); \
2793 if (c2) (*o_putc)(c2); \
2794 if (c3) (*o_putc)(c3); \
2795 if (c4) (*o_putc)(c4); \
2799 w_oconv(nkf_char c2, nkf_char c1)
/* UTF-8 output converter.  output_bom_f is cleared once the BOM step has
 * run.  A tagged Unicode value in c1 is used directly (masked with
 * VALUE_MASK); any other pair is mapped with e2w_conv(), and
 * e2w_combining() is consulted for a second value val2. */
2805 output_bom_f = FALSE;
2816 if (c2 == 0 && nkf_char_unicode_p(c1)){
2817 val = c1 & VALUE_MASK;
2825 val = e2w_conv(c2, c1);
2827 val2 = e2w_combining(val, c2, c1);
/* UTF-16 output helpers.  OUTPUT_UTF16_BYTES writes two bytes in an order
 * that depends on output_endian.  OUTPUT_UTF16 writes a BMP value as one
 * unit and any other value up to UNICODE_MAX as a surrogate pair; both use
 * the caller's c1 and c2 as scratch variables. */
2835 #define OUTPUT_UTF16_BYTES(c1, c2) do { \
2836 if (output_endian == ENDIAN_LITTLE){ \
2845 #define OUTPUT_UTF16(val) do { \
2846 if (nkf_char_unicode_bmp_p(val)) { \
2847 c2 = (val >> 8) & 0xff; \
2849 OUTPUT_UTF16_BYTES(c1, c2); \
2851 val &= VALUE_MASK; \
2852 if (val <= UNICODE_MAX) { \
2853 c2 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ \
2854 c1 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ \
2855 OUTPUT_UTF16_BYTES(c2 & 0xff, (c2 >> 8) & 0xff); \
2856 OUTPUT_UTF16_BYTES(c1 & 0xff, (c1 >> 8) & 0xff); \
2862 w_oconv16(nkf_char c2, nkf_char c1)
/* UTF-16 output converter.  The BOM step clears output_bom_f and writes the
 * byte pair (0xFF, 0xFE) through OUTPUT_UTF16_BYTES.  A tagged Unicode value
 * in c1 is handled on its own branch; any other pair is mapped with
 * e2w_conv(), and e2w_combining() is consulted for a second value val2. */
2865 output_bom_f = FALSE;
2866 OUTPUT_UTF16_BYTES(0xFF, 0xFE);
2874 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2878 val = e2w_conv(c2, c1);
2880 val2 = e2w_combining(val, c2, c1);
2885 OUTPUT_UTF16_BYTES(c1, c2);
/* Write the code point c as UTF-32 through o_putc: least significant byte
 * first when output_endian is ENDIAN_LITTLE, most significant first
 * otherwise. */
2889 #define OUTPUT_UTF32(c) do { \
2890 if (output_endian == ENDIAN_LITTLE){ \
2891 (*o_putc)( (c) & 0xFF); \
2892 (*o_putc)(((c) >> 8) & 0xFF); \
2893 (*o_putc)(((c) >> 16) & 0xFF); \
2897 (*o_putc)(((c) >> 16) & 0xFF); \
2898 (*o_putc)(((c) >> 8) & 0xFF); \
2899 (*o_putc)( (c) & 0xFF); \
2904 w_oconv32(nkf_char c2, nkf_char c1)
/* UTF-32 output converter.  The BOM step clears output_bom_f and branches on
 * output_endian.  ISO_8859_1 and tagged Unicode values each have their own
 * branch; any other pair is mapped with e2w_conv(), and e2w_combining() is
 * consulted for a second value val2. */
2907 output_bom_f = FALSE;
2908 if (output_endian == ENDIAN_LITTLE){
2926 if (c2 == ISO_8859_1) {
2928 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2932 val = e2w_conv(c2, c1);
2934 val2 = e2w_combining(val, c2, c1);
/* Bit flags accumulated in input_code.score by set_code_score().  Each flag
 * is the previous one shifted left by one, so later entries in this list
 * have larger numeric values.  h_conv() selects the candidate encoding with
 * the smallest score. */
2943 #define SCORE_L2 (1) /* Kanji Level 2 */
2944 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2945 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2946 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2947 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2948 #define SCORE_X0213 (SCORE_X0212 << 1) /* JIS X 0213 */
2949 #define SCORE_NO_EXIST (SCORE_X0213 << 1) /* Undefined Characters */
2950 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2951 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
2953 #define SCORE_INIT (SCORE_iMIME)
/* Score lookup tables used by code_score().  Each is indexed with the low
 * nibble of a row byte (c2 & 0x0f or c1 & 0x0f); the table is chosen from
 * the row's high bits, and the 8F* tables apply to the three-byte 0x8f
 * form. */
2955 static const nkf_char score_table_A0[] = {
2958 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2959 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_X0213,
2962 static const nkf_char score_table_F0[] = {
2963 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2964 SCORE_L2, SCORE_DEPEND, SCORE_X0213, SCORE_X0213,
2965 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2966 SCORE_CP932, SCORE_X0213, SCORE_X0213, SCORE_ERROR,
2969 static const nkf_char score_table_8FA0[] = {
2970 0, SCORE_X0213, SCORE_X0212, SCORE_X0213,
2971 SCORE_X0213, SCORE_X0213, SCORE_X0212, SCORE_X0212,
2972 SCORE_X0213, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2973 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2976 static const nkf_char score_table_8FE0[] = {
2977 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2978 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2979 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2980 SCORE_X0212, SCORE_X0212, SCORE_X0213, SCORE_X0213,
2983 static const nkf_char score_table_8FF0[] = {
2984 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0212,
2985 SCORE_X0212, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2986 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2987 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
/* OR the given SCORE_* flag bits into ptr->score. */
2991 set_code_score(struct input_code *ptr, nkf_char score)
2994 ptr->score |= score;
/* Clear the given SCORE_* flag bits from ptr->score. */
2999 clr_code_score(struct input_code *ptr, nkf_char score)
3002 ptr->score &= ~score;
/*
 * Update ptr->score from the character held in ptr->buf[0] (c2) and
 * ptr->buf[1] (c1):
 *  - SS2 adds SCORE_KANA;
 *  - the 0x8f form adds a flag from score_table_8FA0/8FE0/8FF0 chosen by
 *    bits 0x70 of c1, or SCORE_X0212 otherwise;
 *  - with UTF8_OUTPUT_ENABLE, a pair e2w_conv() cannot map adds
 *    SCORE_NO_EXIST;
 *  - rows with (c2 & 0x70) == 0x20 or 0x70 use score_table_A0 / F0, and rows
 *    with (c2 & 0x70) >= 0x50 add SCORE_L2.
 * The first visible branch adds SCORE_ERROR; its condition is evaluated
 * before these tests.
 */
3007 code_score(struct input_code *ptr)
3009 nkf_char c2 = ptr->buf[0];
3010 nkf_char c1 = ptr->buf[1];
3012 set_code_score(ptr, SCORE_ERROR);
3013 }else if (c2 == SS2){
3014 set_code_score(ptr, SCORE_KANA);
3015 }else if (c2 == 0x8f){
3016 if ((c1 & 0x70) == 0x20){
3017 set_code_score(ptr, score_table_8FA0[c1 & 0x0f]);
3018 }else if ((c1 & 0x70) == 0x60){
3019 set_code_score(ptr, score_table_8FE0[c1 & 0x0f]);
3020 }else if ((c1 & 0x70) == 0x70){
3021 set_code_score(ptr, score_table_8FF0[c1 & 0x0f]);
3023 set_code_score(ptr, SCORE_X0212);
3025 #ifdef UTF8_OUTPUT_ENABLE
3026 }else if (!e2w_conv(c2, c1)){
3027 set_code_score(ptr, SCORE_NO_EXIST);
3029 }else if ((c2 & 0x70) == 0x20){
3030 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
3031 }else if ((c2 & 0x70) == 0x70){
3032 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
3033 }else if ((c2 & 0x70) >= 0x50){
3034 set_code_score(ptr, SCORE_L2);
/* Rule out the candidate encoding ptr.  If the active converter iconv is
 * this candidate's iconv_func, the converter is reset with
 * set_iconv(FALSE, 0). */
3039 status_disable(struct input_code *ptr)
3044 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* Append byte c to ptr->buf at ptr->index and advance the index.  No bounds
 * check is performed here. */
3048 status_push_ch(struct input_code *ptr, nkf_char c)
3050 ptr->buf[ptr->index++] = c;
3054 status_clear(struct input_code *ptr)
/* Reset the candidate's detection state; its score goes back to
 * SCORE_INIT. */
3061 status_reset(struct input_code *ptr)
3064 ptr->score = SCORE_INIT;
/* Re-initialise the candidate, including clearing its _file_stat field. */
3068 status_reinit(struct input_code *ptr)
3071 ptr->_file_stat = 0;
/* Hook run for each byte c by the *_status functions; it acts only when c is
 * at most DEL and the encoding is already established (estab_f).
 * NOTE(review): presumably resets the candidate in that case - confirm. */
3075 status_check(struct input_code *ptr, nkf_char c)
3077 if (c <= DEL && estab_f){
/*
 * Shift_JIS detector: feeds byte c into the candidate state ptr.
 * As a first byte, 0xa1..0xdf is recorded as kana (SS2 + c); 0x81..0x9f,
 * 0xe0..0xea, 0xed..0xee, IBM extension leads (SHIFTJIS_CP932) and
 * 0xf0..0xfc (X0212_ENABLE) are recorded as lead bytes; any other
 * unhandled byte disables the candidate.
 * As a trail byte, 0x40..0x7e and 0x80..0xfc are accepted and the pair is
 * converted in place with s2e_conv(); the extension paths also add
 * SCORE_CP932.  Any other trail byte disables the candidate.
 */
3083 s_status(struct input_code *ptr, nkf_char c)
3087 status_check(ptr, c);
3092 }else if (nkf_char_unicode_p(c)){
3094 }else if (0xa1 <= c && c <= 0xdf){
3095 status_push_ch(ptr, SS2);
3096 status_push_ch(ptr, c);
3099 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
3101 status_push_ch(ptr, c);
3102 }else if (0xed <= c && c <= 0xee){
3104 status_push_ch(ptr, c);
3105 #ifdef SHIFTJIS_CP932
3106 }else if (is_ibmext_in_sjis(c)){
3108 status_push_ch(ptr, c);
3109 #endif /* SHIFTJIS_CP932 */
3111 }else if (0xf0 <= c && c <= 0xfc){
3113 status_push_ch(ptr, c);
3114 #endif /* X0212_ENABLE */
3116 status_disable(ptr);
3120 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3121 status_push_ch(ptr, c);
3122 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3126 status_disable(ptr);
3130 #ifdef SHIFTJIS_CP932
3131 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
3132 status_push_ch(ptr, c);
3133 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
3134 set_code_score(ptr, SCORE_CP932);
3139 #endif /* SHIFTJIS_CP932 */
3140 status_disable(ptr);
3143 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3144 status_push_ch(ptr, c);
3145 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3146 set_code_score(ptr, SCORE_CP932);
3149 status_disable(ptr);
/*
 * EUC-JP detector: feeds byte c into the candidate state ptr.
 * As a first byte, SS2 or 0xa1..0xfe starts a two-byte character and 0x8f
 * (X0212_ENABLE) starts a three-byte character; any other unhandled byte
 * disables the candidate.  Following bytes must lie in 0xa1..0xfe,
 * otherwise the candidate is disabled.
 */
3156 e_status(struct input_code *ptr, nkf_char c)
3160 status_check(ptr, c);
3165 }else if (nkf_char_unicode_p(c)){
3167 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
3169 status_push_ch(ptr, c);
3171 }else if (0x8f == c){
3173 status_push_ch(ptr, c);
3174 #endif /* X0212_ENABLE */
3176 status_disable(ptr);
3180 if (0xa1 <= c && c <= 0xfe){
3181 status_push_ch(ptr, c);
3185 status_disable(ptr);
3190 if (0xa1 <= c && c <= 0xfe){
3192 status_push_ch(ptr, c);
3194 status_disable(ptr);
3196 #endif /* X0212_ENABLE */
3200 #ifdef UTF8_INPUT_ENABLE
/*
 * UTF-8 detector: feeds byte c into the candidate state ptr.
 * Lead bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 start two-, three- and
 * four-byte sequences; any other unhandled byte disables the candidate.
 * Continuation bytes must lie in 0x80..0xbf, otherwise the candidate is
 * disabled.  When a sequence is complete (index > stat), a BOM
 * (0xef 0xbb 0xbf) is detected and the bytes are converted in place with
 * w2e_conv().
 */
3202 w_status(struct input_code *ptr, nkf_char c)
3206 status_check(ptr, c);
3211 }else if (nkf_char_unicode_p(c)){
3213 }else if (0xc0 <= c && c <= 0xdf){
3215 status_push_ch(ptr, c);
3216 }else if (0xe0 <= c && c <= 0xef){
3218 status_push_ch(ptr, c);
3219 }else if (0xf0 <= c && c <= 0xf4){
3221 status_push_ch(ptr, c);
3223 status_disable(ptr);
3228 if (0x80 <= c && c <= 0xbf){
3229 status_push_ch(ptr, c);
3230 if (ptr->index > ptr->stat){
3231 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
3232 && ptr->buf[2] == 0xbf);
3233 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
3234 &ptr->buf[0], &ptr->buf[1]);
3241 status_disable(ptr);
3245 if (0x80 <= c && c <= 0xbf){
3246 if (ptr->index < ptr->stat){
3247 status_push_ch(ptr, c);
3252 status_disable(ptr);
/*
 * Feed byte c to the status_func of the candidates in input_code_list
 * (entries without a status_func are skipped or end the scan) and track a
 * candidate in result.  When result is set and no encoding is established
 * yet (!estab_f), its converter is installed with set_iconv(TRUE, ...);
 * otherwise, for c <= DEL, the list is walked again from its head.
 * NOTE(review): the second walk presumably resets every candidate - confirm.
 */
3260 code_status(nkf_char c)
3262 int action_flag = 1;
3263 struct input_code *result = 0;
3264 struct input_code *p = input_code_list;
3266 if (!p->status_func) {
3270 if (!p->status_func)
3272 (p->status_func)(p, c);
3275 }else if(p->stat == 0){
3286 if (result && !estab_f){
3287 set_iconv(TRUE, result->iconv_func);
3288 }else if (c <= DEL){
3289 struct input_code *ptr = input_code_list;
/* Converter state.  std_gc_buf is the push-back buffer written by
 * std_ungetc(); broken_buf and the two *_state fields are set up in
 * nkf_state_init(). */
3299 nkf_buf_t *std_gc_buf;
3300 nkf_char broken_state;
3301 nkf_buf_t *broken_buf;
3302 nkf_char mimeout_state;
/* Single shared instance; NULL until nkf_state_init() allocates it. */
3306 static nkf_state_t *nkf_state = NULL;
/* Capacity passed to nkf_buf_new() for the std_gc_buf push-back buffer. */
3308 #define STD_GC_BUFSIZE (256)
/*
 * Prepare the shared nkf_state.  One path clears the three existing buffers
 * (std_gc_buf, broken_buf, nfc_buf); the other allocates the state with
 * nkf_xmalloc() and creates the buffers with capacities STD_GC_BUFSIZE, 3
 * and 9.  broken_state and mimeout_state are reset to 0.
 * NOTE(review): presumably the clear path runs when nkf_state is already
 * allocated - confirm.
 */
3311 nkf_state_init(void)
3314 nkf_buf_clear(nkf_state->std_gc_buf);
3315 nkf_buf_clear(nkf_state->broken_buf);
3316 nkf_buf_clear(nkf_state->nfc_buf);
3319 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3320 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3321 nkf_state->broken_buf = nkf_buf_new(3);
3322 nkf_state->nfc_buf = nkf_buf_new(9);
3324 nkf_state->broken_state = 0;
3325 nkf_state->mimeout_state = 0;
3332 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3333 return nkf_buf_pop(nkf_state->std_gc_buf);
/* Push character c back onto nkf_state->std_gc_buf; the FILE argument is
 * unused. */
3340 std_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3342 nkf_buf_push(nkf_state->std_gc_buf, c);
3348 std_putc(nkf_char c)
/* Hold buffer for input read before the encoding is known: HOLD_SIZE*2
 * entries, with hold_count giving the number in use. */
3355 static nkf_char hold_buf[HOLD_SIZE*2];
3356 static int hold_count = 0;
/* Append c2 to hold_buf.  A full buffer is detected before storing.  After
 * storing, EOF is returned if the buffer has just become full, otherwise the
 * new hold_count. */
3358 push_hold_buf(nkf_char c2)
3360 if (hold_count >= HOLD_SIZE*2)
3362 hold_buf[hold_count++] = c2;
3363 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3367 h_conv(FILE *f, nkf_char c1, nkf_char c2)
3374 /** it must NOT be in the kanji shift sequence */
3375 /** it must NOT be written in JIS7 */
3376 /** and it must be after 2 byte 8bit code */
3382 while ((c2 = (*i_getc)(f)) != EOF) {
3388 if (push_hold_buf(c2) == EOF || estab_f) {
3394 struct input_code *p = input_code_list;
3395 struct input_code *result = p;
3400 if (p->status_func && p->score < result->score) {
3405 set_iconv(TRUE, result->iconv_func);
3410 ** 1) EOF is detected, or
3411 ** 2) Code is established, or
3412 ** 3) Buffer is FULL (but last word is pushed)
3414 ** in 1) and 3) cases, we continue to use
3415 ** Kanji codes by oconv and leave estab_f unchanged.
3420 while (hold_index < hold_count){
3421 c1 = hold_buf[hold_index++];
3422 if (nkf_char_unicode_p(c1)) {
3426 else if (c1 <= DEL){
3429 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3430 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3434 if (hold_index < hold_count){
3435 c2 = hold_buf[hold_index++];
3446 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3449 if (hold_index < hold_count){
3450 c3 = hold_buf[hold_index++];
3451 } else if ((c3 = (*i_getc)(f)) == EOF) {
3456 if (hold_index < hold_count){
3457 c4 = hold_buf[hold_index++];
3458 } else if ((c4 = (*i_getc)(f)) == EOF) {
3463 (*iconv)(c1, c2, (c3<<8)|c4);
3466 /* 4 bytes UTF-8 (check combining character) */
3467 if (hold_index < hold_count){
3468 c3 = hold_buf[hold_index++];
3470 } else if ((c3 = (*i_getc)(f)) == EOF) {
3471 w_iconv_nocombine(c1, c2, 0);
3474 if (hold_index < hold_count){
3475 c4 = hold_buf[hold_index++];
3477 } else if ((c4 = (*i_getc)(f)) == EOF) {
3478 w_iconv_nocombine(c1, c2, 0);
3479 if (fromhold_count <= 2)
3485 if (w_iconv_combine(c1, c2, 0, c3, c4, 0)) {
3486 w_iconv_nocombine(c1, c2, 0);
3487 if (fromhold_count <= 2) {
3490 } else if (fromhold_count == 3) {
3499 /* 3 bytes EUC or UTF-8 */
3500 if (hold_index < hold_count){
3501 c3 = hold_buf[hold_index++];
3503 } else if ((c3 = (*i_getc)(f)) == EOF) {
3509 if ((*iconv)(c1, c2, c3) == -3) {
3510 /* 6 bytes UTF-8 (check combining character) */
3512 if (hold_index < hold_count){
3513 c4 = hold_buf[hold_index++];
3515 } else if ((c4 = (*i_getc)(f)) == EOF) {
3516 w_iconv_nocombine(c1, c2, c3);
3519 if (hold_index < hold_count){
3520 c5 = hold_buf[hold_index++];
3522 } else if ((c5 = (*i_getc)(f)) == EOF) {
3523 w_iconv_nocombine(c1, c2, c3);
3524 if (fromhold_count == 4)
3530 if (hold_index < hold_count){
3531 c6 = hold_buf[hold_index++];
3533 } else if ((c6 = (*i_getc)(f)) == EOF) {
3534 w_iconv_nocombine(c1, c2, c3);
3535 if (fromhold_count == 5) {
3537 } else if (fromhold_count == 4) {
3546 if (w_iconv_combine(c1, c2, c3, c4, c5, c6)) {
3547 w_iconv_nocombine(c1, c2, c3);
3548 if (fromhold_count == 6) {
3550 } else if (fromhold_count == 5) {
3553 } else if (fromhold_count == 4) {
3566 if (c3 == EOF) break;
3572 * Check and Ignore BOM
3578 input_bom_f = FALSE;
3579 switch(c2 = (*i_getc)(f)){
3581 if((c2 = (*i_getc)(f)) == 0x00){
3582 if((c2 = (*i_getc)(f)) == 0xFE){
3583 if((c2 = (*i_getc)(f)) == 0xFF){
3584 if(!input_encoding){
3585 set_iconv(TRUE, w_iconv32);
3587 if (iconv == w_iconv32) {
3589 input_endian = ENDIAN_BIG;
3592 (*i_ungetc)(0xFF,f);
3593 }else (*i_ungetc)(c2,f);
3594 (*i_ungetc)(0xFE,f);
3595 }else if(c2 == 0xFF){
3596 if((c2 = (*i_getc)(f)) == 0xFE){
3597 if(!input_encoding){
3598 set_iconv(TRUE, w_iconv32);
3600 if (iconv == w_iconv32) {
3601 input_endian = ENDIAN_2143;
3604 (*i_ungetc)(0xFF,f);
3605 }else (*i_ungetc)(c2,f);
3606 (*i_ungetc)(0xFF,f);
3607 }else (*i_ungetc)(c2,f);
3608 (*i_ungetc)(0x00,f);
3609 }else (*i_ungetc)(c2,f);
3610 (*i_ungetc)(0x00,f);
3613 if((c2 = (*i_getc)(f)) == 0xBB){
3614 if((c2 = (*i_getc)(f)) == 0xBF){
3615 if(!input_encoding){
3616 set_iconv(TRUE, w_iconv);
3618 if (iconv == w_iconv) {
3622 (*i_ungetc)(0xBF,f);
3623 }else (*i_ungetc)(c2,f);
3624 (*i_ungetc)(0xBB,f);
3625 }else (*i_ungetc)(c2,f);
3626 (*i_ungetc)(0xEF,f);
3629 if((c2 = (*i_getc)(f)) == 0xFF){
3630 if((c2 = (*i_getc)(f)) == 0x00){
3631 if((c2 = (*i_getc)(f)) == 0x00){
3632 if(!input_encoding){
3633 set_iconv(TRUE, w_iconv32);
3635 if (iconv == w_iconv32) {
3636 input_endian = ENDIAN_3412;
3639 (*i_ungetc)(0x00,f);
3640 }else (*i_ungetc)(c2,f);
3641 (*i_ungetc)(0x00,f);
3642 }else (*i_ungetc)(c2,f);
3643 if(!input_encoding){
3644 set_iconv(TRUE, w_iconv16);
3646 if (iconv == w_iconv16) {
3647 input_endian = ENDIAN_BIG;
3651 (*i_ungetc)(0xFF,f);
3652 }else (*i_ungetc)(c2,f);
3653 (*i_ungetc)(0xFE,f);
3656 if((c2 = (*i_getc)(f)) == 0xFE){
3657 if((c2 = (*i_getc)(f)) == 0x00){
3658 if((c2 = (*i_getc)(f)) == 0x00){
3659 if(!input_encoding){
3660 set_iconv(TRUE, w_iconv32);
3662 if (iconv == w_iconv32) {
3663 input_endian = ENDIAN_LITTLE;
3667 (*i_ungetc)(0x00,f);
3668 }else (*i_ungetc)(c2,f);
3669 (*i_ungetc)(0x00,f);
3670 }else (*i_ungetc)(c2,f);
3671 if(!input_encoding){
3672 set_iconv(TRUE, w_iconv16);
3674 if (iconv == w_iconv16) {
3675 input_endian = ENDIAN_LITTLE;
3679 (*i_ungetc)(0xFE,f);
3680 }else (*i_ungetc)(c2,f);
3681 (*i_ungetc)(0xFF,f);
3690 broken_getc(FILE *f)
3694 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3695 return nkf_buf_pop(nkf_state->broken_buf);
3698 if (c=='$' && nkf_state->broken_state != ESC
3699 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3701 nkf_state->broken_state = 0;
3702 if (c1=='@'|| c1=='B') {
3703 nkf_buf_push(nkf_state->broken_buf, c1);
3704 nkf_buf_push(nkf_state->broken_buf, c);
3710 } else if (c=='(' && nkf_state->broken_state != ESC
3711 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3713 nkf_state->broken_state = 0;
3714 if (c1=='J'|| c1=='B') {
3715 nkf_buf_push(nkf_state->broken_buf, c1);
3716 nkf_buf_push(nkf_state->broken_buf, c);
3723 nkf_state->broken_state = c;
/*
 * Push c back onto nkf_state->broken_buf, but only while that buffer holds
 * fewer than 2 entries; when it already holds 2 or more, c is not stored.
 * The FILE argument is unused (ARG_UNUSED).
 */
3729 broken_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3731 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3732 nkf_buf_push(nkf_state->broken_buf, c);
3737 eol_conv(nkf_char c2, nkf_char c1)
3739 if (guess_f && input_eol != EOF) {
3740 if (c2 == 0 && c1 == LF) {
3741 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3742 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3743 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3745 else if (!input_eol) input_eol = CR;
3746 else if (input_eol != CR) input_eol = EOF;
3748 if (prev_cr || (c2 == 0 && c1 == LF)) {
3750 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3751 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3753 if (c2 == 0 && c1 == CR) prev_cr = CR;
3754 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3758 put_newline(void (*func)(nkf_char))
3760 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3775 oconv_newline(void (*func)(nkf_char, nkf_char))
3777 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3792 Return value of fold_conv()
3794 LF add newline and output char
3795 CR add newline and output nothing
3798 1 (or else) normal output
3800 fold state in prev (previous character)
3802 >0x80 Japanese (X0208/X0201)
3807 This fold algorithm does not preserve leading spaces in a line.
3808 This is the main difference from fmt.
3811 #define char_size(c2,c1) (c2?2:1)
3814 fold_conv(nkf_char c2, nkf_char c1)
3817 nkf_char fold_state;
3819 if (c1== CR && !fold_preserve_f) {
3820 fold_state=0; /* ignore cr */
3821 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3823 fold_state=0; /* ignore cr */
3824 } else if (c1== BS) {
3825 if (f_line>0) f_line--;
3827 } else if (c2==EOF && f_line != 0) { /* close open last line */
3829 } else if ((c1==LF && !fold_preserve_f)
3830 || ((c1==CR||(c1==LF&&f_prev!=CR))
3831 && fold_preserve_f)) {
3833 if (fold_preserve_f) {
3837 } else if ((f_prev == c1)
3839 ) { /* duplicate newline */
3842 fold_state = LF; /* output two newline */
3848 if (f_prev&0x80) { /* Japanese? */
3850 fold_state = 0; /* ignore given single newline */
3851 } else if (f_prev==SP) {
3855 if (++f_line<=fold_len)
3859 fold_state = CR; /* fold and output nothing */
3863 } else if (c1=='\f') {
3866 fold_state = LF; /* output newline and clear */
3867 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3868 /* X0208 kankaku or ascii space */
3870 fold_state = 0; /* remove duplicate spaces */
3873 if (++f_line<=fold_len)
3874 fold_state = SP; /* output ASCII space only */
3876 f_prev = SP; f_line = 0;
3877 fold_state = CR; /* fold and output nothing */
3881 prev0 = f_prev; /* we still need this one... , but almost done */
3883 if (c2 || c2 == JIS_X_0201_1976_K)
3884 f_prev |= 0x80; /* this is Japanese */
3885 f_line += c2 == JIS_X_0201_1976_K ? 1: char_size(c2,c1);
3886 if (f_line<=fold_len) { /* normal case */
3889 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3890 f_line = char_size(c2,c1);
3891 fold_state = LF; /* We can't wait, do fold now */
3892 } else if (c2 == JIS_X_0201_1976_K) {
3893 /* simple kinsoku rules return 1 means no folding */
3894 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3895 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3896 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3897 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3898 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3899 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3900 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3902 fold_state = LF;/* add one new f_line before this character */
3905 fold_state = LF;/* add one new f_line before this character */
3908 /* kinsoku point in ASCII */
3909 if ( c1==')'|| /* { [ ( */
3920 /* just after special */
3921 } else if (!is_alnum(prev0)) {
3922 f_line = char_size(c2,c1);
3924 } else if ((prev0==SP) || /* ignored new f_line */
3925 (prev0==LF)|| /* ignored new f_line */
3926 (prev0&0x80)) { /* X0208 - ASCII */
3927 f_line = char_size(c2,c1);
3928 fold_state = LF;/* add one new f_line before this character */
3930 fold_state = 1; /* default no fold in ASCII */
3934 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3935 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3936 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3937 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3938 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3939 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3940 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3941 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3942 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3943 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3944 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3945 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3946 /* default no fold in kinsoku */
3949 f_line = char_size(c2,c1);
3950 /* add one new f_line before this character */
3953 f_line = char_size(c2,c1);
3955 /* add one new f_line before this character */
3960 /* terminator process */
3961 switch(fold_state) {
3963 oconv_newline(o_fconv);
3969 oconv_newline(o_fconv);
3980 static nkf_char z_prev2=0,z_prev1=0;
3983 z_conv(nkf_char c2, nkf_char c1)
3986 /* if (c2) c1 &= 0x7f; assertion */
3988 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3994 if (z_prev2 == JIS_X_0201_1976_K) {
3995 if (c2 == JIS_X_0201_1976_K) {
3996 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
3998 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4000 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4002 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4004 } else if (x0213_f && c1 == (0xdf&0x7f) && ev_x0213[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4006 (*o_zconv)(ev_x0213[(z_prev1-SP)*2], ev_x0213[(z_prev1-SP)*2+1]);
4011 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4013 if (c2 == JIS_X_0201_1976_K) {
4014 if (dv[(c1-SP)*2] || ev[(c1-SP)*2] || (x0213_f && ev_x0213[(c1-SP)*2])) {
4015 /* wait for 濁点 (dakuten, voiced sound mark) or 半濁点 (handakuten, semi-voiced sound mark) */
4020 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4031 if (alpha_f&1 && c2 == 0x23) {
4032 /* JISX0208 Alphabet */
4034 } else if (c2 == 0x21) {
4035 /* JISX0208 Kigou */
4040 } else if (alpha_f&4) {
4045 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4051 if (alpha_f&8 && c2 == 0) {
4053 const char *entity = 0;
/* HTML-special characters are replaced by their named entity references. */
4055 case '>': entity = "&gt;"; break;
4056 case '<': entity = "&lt;"; break;
4057 case '\"': entity = "&quot;"; break;
4058 case '&': entity = "&amp;"; break;
4061 while (*entity) (*o_zconv)(0, *entity++);
4067 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4072 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4076 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4080 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4084 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4088 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4092 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4096 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4100 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4105 (*o_zconv)(JIS_X_0201_1976_K, c);
4108 } else if (c2 == 0x25) {
4109 /* JISX0208 Katakana */
4110 static const int fullwidth_to_halfwidth[] =
4112 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4113 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4114 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4115 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4116 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4117 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4118 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4119 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4120 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4121 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4122 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x365F,
4123 0x375F, 0x385F, 0x395F, 0x3A5F, 0x3E5F, 0x425F, 0x445F, 0x0000
4125 if (fullwidth_to_halfwidth[c1-0x20]){
4126 c2 = fullwidth_to_halfwidth[c1-0x20];
4127 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
4129 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
4133 } else if (c2 == 0 && nkf_char_unicode_p(c1) &&
4134 ((c1&VALUE_MASK) == 0x3099 || (c1&VALUE_MASK) == 0x309A)) { /*
\e$B9g@.MQByE@!&H>ByE@
\e(B */
4135 (*o_zconv)(JIS_X_0201_1976_K, 0x5E + (c1&VALUE_MASK) - 0x3099);
4143 #define rot13(c) ( \
4145 (c <= 'M') ? (c + 13): \
4146 (c <= 'Z') ? (c - 13): \
4148 (c <= 'm') ? (c + 13): \
4149 (c <= 'z') ? (c - 13): \
4153 #define rot47(c) ( \
4155 ( c <= 'O') ? (c + 47) : \
4156 ( c <= '~') ? (c - 47) : \
4161 rot_conv(nkf_char c2, nkf_char c1)
4163 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
4169 (*o_rot_conv)(c2,c1);
4173 hira_conv(nkf_char c2, nkf_char c1)
4177 if (0x20 < c1 && c1 < 0x74) {
4179 (*o_hira_conv)(c2,c1);
4181 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
4183 c1 = nkf_char_unicode_new(0x3094);
4184 (*o_hira_conv)(c2,c1);
4187 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4189 (*o_hira_conv)(c2,c1);
4194 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
4197 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4199 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4203 (*o_hira_conv)(c2,c1);
4208 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4210 #define RANGE_NUM_MAX 18
4211 static const nkf_char range[RANGE_NUM_MAX][2] = {
4232 nkf_char start, end, c;
4234 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4238 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4243 for (i = 0; i < RANGE_NUM_MAX; i++) {
4244 start = range[i][0];
4247 if (c >= start && c <= end) {
4252 (*o_iso2022jp_check_conv)(c2,c1);
4256 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * MIME encoded-word prefixes ("\075" is '=', so each entry reads
 * "=?CHARSET?X?" with X the encoding letter B or Q).
 *
 * The entries are index-aligned with mime_priority_func[], mime_encode[] and
 * mime_encode_method[]: the same index is used into all of them by
 * mime_begin_strict() and open_mime().  "=?ISO-2022-JP?B?" appears twice on
 * purpose -- the two slots pair with different mime_encode[] values
 * (JIS_X_0208 and JIS_X_0201_1976_K).
 *
 * All charset names are upper case; mime_begin_strict() upper-cases each
 * input character before comparing it with a pattern byte.
 */
4258 static const unsigned char *mime_pattern[] = {
4259 (const unsigned char *)"\075?EUC-JP?B?",
4260 (const unsigned char *)"\075?SHIFT_JIS?B?",
4261 (const unsigned char *)"\075?ISO-8859-1?Q?",
4262 (const unsigned char *)"\075?ISO-8859-1?B?",
4263 (const unsigned char *)"\075?ISO-2022-JP?B?",
4264 (const unsigned char *)"\075?ISO-2022-JP?B?",
4265 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4266 #if defined(UTF8_INPUT_ENABLE)
4267 (const unsigned char *)"\075?UTF-8?B?",
4268 (const unsigned char *)"\075?UTF-8?Q?",
4270 (const unsigned char *)"\075?US-ASCII?Q?",
4275 /* 該当するコードの優先度を上げるための目印 (marker used to raise the priority of the corresponding encoding) */
4276 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4277 e_iconv, s_iconv, 0, 0, 0, 0, 0,
4278 #if defined(UTF8_INPUT_ENABLE)
4284 static const nkf_char mime_encode[] = {
4285 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K,
4286 #if defined(UTF8_INPUT_ENABLE)
4293 static const nkf_char mime_encode_method[] = {
4294 'B', 'B','Q', 'B', 'B', 'B', 'Q',
4295 #if defined(UTF8_INPUT_ENABLE)
4303 /* MIME preprocessor fifo */
4305 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
4306 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
4307 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
4309 unsigned char buf[MIME_BUF_SIZE];
4311 unsigned int last; /* decoded */
4312 unsigned int input; /* undecoded */
4314 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
4316 #define MAXRECOVER 20
/*
 * Put c back at the front of the MIME input ring buffer: top is moved back
 * by one and c, truncated to unsigned char, is stored there.  The index is
 * wrapped by MIME_BUF_MASK inside the mime_input_buf() macro.
 */
4319 mime_input_buf_unshift(nkf_char c)
4321 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
/*
 * Unget function used while the MIME reader is installed: pushes c back
 * into the MIME ring buffer via mime_input_buf_unshift().
 * The FILE argument is unused (ARG_UNUSED).
 */
4325 mime_ungetc(nkf_char c, ARG_UNUSED FILE *f)
4327 mime_input_buf_unshift(c);
4332 mime_ungetc_buf(nkf_char c, FILE *f)
4335 (*i_mungetc_buf)(c,f);
4337 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
/*
 * Fetch the next undecoded MIME input byte.
 * When mimebuf_f is non-zero the byte comes from the saved underlying reader
 * (*i_mgetc_buf)(f); otherwise it is taken from the ring buffer at
 * mime_input_state.input, which is then advanced by one.
 */
4342 mime_getc_buf(FILE *f)
4344 /* we don't keep eof of mime_input_buf, because it contains ?= as
4345 a terminator. It was checked in mime_integrity. */
4346 return ((mimebuf_f)?
4347 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
/*
 * Install the MIME-decoding reader in front of the current input functions.
 *
 * Nothing happens if i_getc is already mime_getc.  Otherwise the current
 * i_getc/i_ungetc are saved in i_mgetc/i_mungetc and replaced by
 * mime_getc/mime_ungetc.  When mime_f == STRICT_MIME the saved pair is moved
 * one level further into i_mgetc_buf/i_mungetc_buf, and i_mgetc/i_mungetc
 * are pointed at mime_getc_buf/mime_ungetc_buf.
 */
4351 switch_mime_getc(void)
4353 if (i_getc!=mime_getc) {
4354 i_mgetc = i_getc; i_getc = mime_getc;
4355 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4356 if(mime_f==STRICT_MIME) {
4357 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4358 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4364 unswitch_mime_getc(void)
4366 if(mime_f==STRICT_MIME) {
4367 i_mgetc = i_mgetc_buf;
4368 i_mungetc = i_mungetc_buf;
4371 i_ungetc = i_mungetc;
4372 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4373 mime_iconv_back = NULL;
4377 mime_integrity(FILE *f, const unsigned char *p)
4381 /* In buffered mode, read until =? or NL or buffer full
4383 mime_input_state.input = mime_input_state.top;
4384 mime_input_state.last = mime_input_state.top;
4386 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
4388 q = mime_input_state.input;
4389 while((c=(*i_getc)(f))!=EOF) {
4390 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
4391 break; /* buffer full */
4393 if (c=='=' && d=='?') {
4394 /* checked. skip header, start decode */
4395 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4396 /* mime_last_input = mime_input_state.input; */
4397 mime_input_state.input = q;
4401 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4403 /* Should we check length mod 4? */
4404 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4407 /* In case of Incomplete MIME, no MIME decode */
4408 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4409 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
4410 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
4411 switch_mime_getc(); /* anyway we need buffered getc */
4416 mime_begin_strict(FILE *f)
4420 const unsigned char *p,*q;
4421 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4423 mime_decode_mode = FALSE;
4424 /* =? has been checked */
4426 p = mime_pattern[j];
4429 for(i=2;p[i]>SP;i++) { /* start at =? */
4430 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4431 /* pattern fails, try next one */
4433 while (mime_pattern[++j]) {
4434 p = mime_pattern[j];
4435 for(k=2;k<i;k++) /* assume length(p) > i */
4436 if (p[k]!=q[k]) break;
4437 if (k==i && nkf_toupper(c1)==p[k]) break;
4439 p = mime_pattern[j];
4440 if (p) continue; /* found next one, continue */
4441 /* all fails, output from recovery buffer */
4449 mime_decode_mode = p[i-2];
4451 mime_iconv_back = iconv;
4452 set_iconv(FALSE, mime_priority_func[j]);
4453 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4455 if (mime_decode_mode=='B') {
4456 mimebuf_f = unbuf_f;
4458 /* do MIME integrity check */
4459 return mime_integrity(f,mime_pattern[j]);
4473 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4474 /* re-read and convert again from mime_buffer. */
4476 /* =? has been checked */
4477 k = mime_input_state.last;
4478 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4479 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4480 /* We accept any character type even if it is broken by new lines */
4481 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4482 if (c1==LF||c1==SP||c1==CR||
4483 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4485 /* Failed. But this could be another MIME preamble */
4487 mime_input_state.last--;
4493 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4494 if (!(++i<MAXRECOVER) || c1==EOF) break;
4495 if (c1=='b'||c1=='B') {
4496 mime_decode_mode = 'B';
4497 } else if (c1=='q'||c1=='Q') {
4498 mime_decode_mode = 'Q';
4502 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4503 if (!(++i<MAXRECOVER) || c1==EOF) break;
4505 mime_decode_mode = FALSE;
4511 if (!mime_decode_mode) {
4512 /* false MIME preamble, restart from mime_buffer */
4513 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4514 /* Since we are in MIME mode until buffer becomes empty, */
4515 /* we never go into mime_begin again for a while. */
4518 /* discard mime preamble, and goto MIME mode */
4519 mime_input_state.last = k;
4520 /* do no MIME integrity check */
4521 return c1; /* used only for checking EOF */
4526 no_putc(ARG_UNUSED nkf_char c)
/*
 * Write str followed by a newline to stderr; a null str is printed as the
 * literal text "NULL".
 *
 * NOTE(review): the line between the signature and the fprintf is not shown
 * in this listing, so any condition guarding the output cannot be seen here.
 */
4532 debug(const char *str)
4535 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Record the name of the input encoding that was detected.
 *
 * The first call stores codename.  A later call with a different name
 * replaces the stored value with "" (empty string), which marks the input as
 * having conflicting encodings; get_guessed_code() reports that as "BINARY".
 * A later call with the same name changes nothing.
 *
 * The pointer itself is stored, not a copy of the string.
 */
4541 set_input_codename(const char *codename)
4543 if (!input_codename) {
4544 input_codename = codename;
4545 } else if (strcmp(codename, input_codename) != 0) {
4546 input_codename = "";
/*
 * Turn the recorded input_codename into the name that is reported, store it
 * back into input_codename and return it.
 *
 *   ""            -> "BINARY"  (set_input_codename() saw conflicting names)
 *   NULL          -> "ASCII"   (no name was ever recorded)
 *   "Shift_JIS"   -> "CP932"        if score has SCORE_DEPEND or SCORE_CP932
 *   "EUC-JP"      -> "EUC-JIS-2004" if score has SCORE_X0213
 *                    "EUCJP-MS"     else if score has SCORE_X0212
 *                    "CP51932"      else if score has SCORE_DEPEND or SCORE_CP932
 *   "ISO-2022-JP" -> "CP50221"      if score has SCORE_KANA
 *                    "CP50220"      else if score has SCORE_DEPEND or SCORE_CP932
 *
 * Any other name, or a name whose score bits match none of the tests, is
 * returned unchanged.
 */
4551 get_guessed_code(void)
4553 if (input_codename && !*input_codename) {
4554 input_codename = "BINARY";
/* Score flags are taken from the input_code entry for the current iconv. */
/* NOTE(review): p is dereferenced below without a NULL check -- confirm that
   find_inputcode_byfunc() cannot return NULL on the paths that use p. */
4556 struct input_code *p = find_inputcode_byfunc(iconv);
4557 if (!input_codename) {
4558 input_codename = "ASCII";
4559 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4560 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4561 input_codename = "CP932";
4562 } else if (strcmp(input_codename, "EUC-JP") == 0) {
/* Tests are ordered: X0213 wins over X0212, which wins over DEPEND/CP932. */
4563 if (p->score & SCORE_X0213)
4564 input_codename = "EUC-JIS-2004";
4565 else if (p->score & (SCORE_X0212))
4566 input_codename = "EUCJP-MS";
4567 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4568 input_codename = "CP51932";
4569 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4570 if (p->score & (SCORE_KANA))
4571 input_codename = "CP50221";
4572 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4573 input_codename = "CP50220";
4576 return input_codename;
4579 #if !defined(PERL_XS) && !defined(WIN32DLL)
4581 print_guessed_code(char *filename)
4583 if (filename != NULL) printf("%s: ", filename);
4584 if (input_codename && !*input_codename) {
4587 input_codename = get_guessed_code();
4589 printf("%s\n", input_codename);
4591 printf("%s%s%s%s\n",
4593 iconv != w_iconv16 && iconv != w_iconv32 ? "" :
4594 input_endian == ENDIAN_LITTLE ? " LE" :
4595 input_endian == ENDIAN_BIG ? " BE" :
4597 input_bom_f ? " (BOM)" : "",
4598 input_eol == CR ? " (CR)" :
4599 input_eol == LF ? " (LF)" :
4600 input_eol == CRLF ? " (CRLF)" :
4601 input_eol == EOF ? " (MIXED NL)" :
4611 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4613 nkf_char c1, c2, c3;
4619 if (!nkf_isxdigit(c2)){
4624 if (!nkf_isxdigit(c3)){
4629 return (hex2bin(c2) << 4) | hex2bin(c3);
4635 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Unget counterpart of the ':' hex reader: forwards c to the saved underlying
   unget function i_cungetc and returns its result. */
4639 cap_ungetc(nkf_char c, FILE *f)
4641 return (*i_cungetc)(c, f);
4647 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Unget counterpart of the '%' hex reader: forwards c to the saved underlying
   unget function i_uungetc and returns its result. */
4651 url_ungetc(nkf_char c, FILE *f)
4653 return (*i_uungetc)(c, f);
4657 #ifdef NUMCHAR_OPTION
4659 numchar_getc(FILE *f)
4661 nkf_char (*g)(FILE *) = i_ngetc;
4662 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4673 if (buf[i] == 'x' || buf[i] == 'X'){
4674 for (j = 0; j < 7; j++){
4676 if (!nkf_isxdigit(buf[i])){
4683 c |= hex2bin(buf[i]);
4686 for (j = 0; j < 8; j++){
4690 if (!nkf_isdigit(buf[i])){
4697 c += hex2bin(buf[i]);
4703 return nkf_char_unicode_new(c);
/* Unget counterpart of numchar_getc(): forwards c to the saved underlying
   unget function i_nungetc and returns its result. */
4713 numchar_ungetc(nkf_char c, FILE *f)
4715 return (*i_nungetc)(c, f);
4719 #ifdef UNICODE_NORMALIZATION
4724 nkf_char (*g)(FILE *f) = i_nfc_getc;
4725 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4726 nkf_buf_t *buf = nkf_state->nfc_buf;
4727 const unsigned char *array;
4728 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4729 nkf_char c = (*g)(f);
4731 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4733 nkf_buf_push(buf, c);
4735 while (lower <= upper) {
4736 int mid = (lower+upper) / 2;
4738 array = normalization_table[mid].nfd;
4739 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4740 if (len >= nkf_buf_length(buf)) {
4744 lower = 1, upper = 0;
4747 nkf_buf_push(buf, c);
4749 if (array[len] != nkf_buf_at(buf, len)) {
4750 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4751 else upper = mid - 1;
4758 array = normalization_table[mid].nfc;
4760 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4761 nkf_buf_push(buf, array[i]);
4765 } while (lower <= upper);
4767 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4768 c = nkf_buf_pop(buf);
/* Unget counterpart of the normalizing reader: forwards c to the saved
   underlying unget function i_nfc_ungetc and returns its result. */
4774 nfc_ungetc(nkf_char c, FILE *f)
4776 return (*i_nfc_ungetc)(c, f);
4778 #endif /* UNICODE_NORMALIZATION */
4782 base64decode(nkf_char c)
4787 i = c - 'A'; /* A..Z 0-25 */
4788 } else if (c == '_') {
4789 i = '?' /* 63 */ ; /* _ 63 */
4791 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4793 } else if (c > '/') {
4794 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4795 } else if (c == '+' || c == '-') {
4796 i = '>' /* 62 */ ; /* + and - 62 */
4798 i = '?' /* 63 */ ; /* / 63 */
4806 nkf_char c1, c2, c3, c4, cc;
4807 nkf_char t1, t2, t3, t4, mode, exit_mode;
4808 nkf_char lwsp_count;
4811 nkf_char lwsp_size = 128;
4813 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4814 return mime_input_buf(mime_input_state.top++);
4816 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4817 mime_decode_mode=FALSE;
4818 unswitch_mime_getc();
4819 return (*i_getc)(f);
4822 if (mimebuf_f == FIXED_MIME)
4823 exit_mode = mime_decode_mode;
4826 if (mime_decode_mode == 'Q') {
4827 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4829 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4830 if (c1<=SP || DEL<=c1) {
4831 mime_decode_mode = exit_mode; /* prepare for quit */
4834 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4838 mime_decode_mode = exit_mode; /* prepare for quit */
4839 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4840 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4841 /* end Q encoding */
4842 input_mode = exit_mode;
4844 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4845 while ((c1=(*i_getc)(f))!=EOF) {
4850 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4858 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4859 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4874 lwsp_buf[lwsp_count] = (unsigned char)c1;
4875 if (lwsp_count++>lwsp_size){
4877 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4878 lwsp_buf = lwsp_buf_new;
4884 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4886 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4887 i_ungetc(lwsp_buf[lwsp_count],f);
4890 nkf_xfree(lwsp_buf);
4893 if (c1=='='&&c2<SP) { /* this is soft wrap */
4894 while((c1 = (*i_mgetc)(f)) <=SP) {
4895 if (c1 == EOF) return (EOF);
4897 mime_decode_mode = 'Q'; /* still in MIME */
4898 goto restart_mime_q;
4901 mime_decode_mode = 'Q'; /* still in MIME */
4905 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4906 if (c2<=SP) return c2;
4907 mime_decode_mode = 'Q'; /* still in MIME */
4908 return ((hex2bin(c2)<<4) + hex2bin(c3));
4911 if (mime_decode_mode != 'B') {
4912 mime_decode_mode = FALSE;
4913 return (*i_mgetc)(f);
4917 /* Base64 encoding */
4919 MIME allows a line break in the middle of
4920 Base64, but we are very pessimistic in decoding
4921 in unbuf mode because MIME-encoded text may be broken by
4922 less's or an editor's control sequences (such as ESC-[-K) in unbuffered
4923 mode. Ignore incomplete MIME.
4925 mode = mime_decode_mode;
4926 mime_decode_mode = exit_mode; /* prepare for quit */
4928 while ((c1 = (*i_mgetc)(f))<=SP) {
4933 if ((c2 = (*i_mgetc)(f))<=SP) {
4936 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4937 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4940 if ((c1 == '?') && (c2 == '=')) {
4943 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4944 while ((c1=(*i_getc)(f))!=EOF) {
4949 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4957 if ((c1=(*i_getc)(f))!=EOF) {
4961 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4976 lwsp_buf[lwsp_count] = (unsigned char)c1;
4977 if (lwsp_count++>lwsp_size){
4979 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4980 lwsp_buf = lwsp_buf_new;
4986 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4988 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4989 i_ungetc(lwsp_buf[lwsp_count],f);
4992 nkf_xfree(lwsp_buf);
4996 if ((c3 = (*i_mgetc)(f))<=SP) {
4999 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5000 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5004 if ((c4 = (*i_mgetc)(f))<=SP) {
5007 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5008 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5012 mime_decode_mode = mode; /* still in MIME sigh... */
5014 /* BASE 64 decoding */
5016 t1 = 0x3f & base64decode(c1);
5017 t2 = 0x3f & base64decode(c2);
5018 t3 = 0x3f & base64decode(c3);
5019 t4 = 0x3f & base64decode(c4);
5020 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5022 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5023 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5025 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5026 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5028 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5033 return mime_input_buf(mime_input_state.top++);
5036 static const char basis_64[] =
5037 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
5039 #define MIMEOUT_BUF_LENGTH 74
5041 unsigned char buf[MIMEOUT_BUF_LENGTH+1];
5045 /*nkf_char mime_lastchar2, mime_lastchar1;*/
5048 open_mime(nkf_char mode)
5050 const unsigned char *p;
5053 p = mime_pattern[0];
5054 for(i=0;mime_pattern[i];i++) {
5055 if (mode == mime_encode[i]) {
5056 p = mime_pattern[i];
5060 mimeout_mode = mime_encode_method[i];
5062 if (base64_count>45) {
5063 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
5064 (*o_mputc)(mimeout_state.buf[i]);
5067 put_newline(o_mputc);
5070 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
5074 for (;i<mimeout_state.count;i++) {
5075 if (nkf_isspace(mimeout_state.buf[i])) {
5076 (*o_mputc)(mimeout_state.buf[i]);
5086 j = mimeout_state.count;
5087 mimeout_state.count = 0;
5089 mime_putc(mimeout_state.buf[i]);
5094 mime_prechar(nkf_char c2, nkf_char c1)
5096 if (mimeout_mode > 0){
5098 if (base64_count + mimeout_state.count/3*4> 73){
5099 (*o_base64conv)(EOF,0);
5100 oconv_newline(o_base64conv);
5101 (*o_base64conv)(0,SP);
5105 if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) {
5106 (*o_base64conv)(EOF,0);
5107 oconv_newline(o_base64conv);
5108 (*o_base64conv)(0,SP);
5114 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
5115 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5116 open_mime(output_mode);
5117 (*o_base64conv)(EOF,0);
5118 oconv_newline(o_base64conv);
5119 (*o_base64conv)(0,SP);
5138 switch(mimeout_mode) {
5143 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
5149 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
5154 if (mimeout_mode > 0) {
5155 if (mimeout_f!=FIXED_MIME) {
5157 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte `c` according to mimeout_mode and write
 * the result through o_mputc.
 *
 * The `case` labels are not visible in this excerpt.  The visible branches
 * are a hex escape for non-alphanumeric bytes and a three-step base64
 * encoder that keeps the previous byte in nkf_state->mimeout_state.
 */
5163 mimeout_addchar(nkf_char c)
5165 switch(mimeout_mode) {
/* Non-alphanumeric byte: written as two hex digits, high nibble first. */
5170 } else if(!nkf_isalnum(c)) {
5172 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5173 (*o_mputc)(bin2hex((c&0xf)));
/* Remember c and emit the 6-bit value c>>2. */
5181 nkf_state->mimeout_state=c;
5182 (*o_mputc)(basis_64[c>>2]);
/* Emit the low 2 bits of the remembered byte joined with the high 4 bits
 * of c, then remember c. */
5187 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5188 nkf_state->mimeout_state=c;
/* Emit the low 4 bits of the remembered byte joined with the top 2 bits
 * of c, followed by the low 6 bits of c. */
5193 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
5194 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_putc: pass one output byte through the MIME encoder.
 *
 * c is the byte to output, or EOF to flush what is pending.
 *
 * When mimeout_f == FIXED_MIME every byte is encoded with
 * mimeout_addchar(), and a line break is inserted once base64_count
 * exceeds 71.  Otherwise bytes are collected in mimeout_state.buf and,
 * depending on mimeout_mode, output_mode and the buffered bytes, are either
 * written raw through o_mputc, encoded through mimeout_addchar(), or used
 * to start an encoded section with open_mime().
 */
5206 mime_putc(nkf_char c)
5211 if (mimeout_f == FIXED_MIME){
5212 if (mimeout_mode == 'Q'){
5213 if (base64_count > 71){
5214 if (c!=CR && c!=LF) {
5216 put_newline(o_mputc);
5221 if (base64_count > 71){
5223 put_newline(o_mputc);
5226 if (c == EOF) { /* c==EOF */
5230 if (c != EOF) { /* c!=EOF */
5236 /* mimeout_f != FIXED_MIME */
/* Flush request: drain the pending buffer, opening an encoded section
 * first if one was deferred (mimeout_mode == -1). */
5238 if (c == EOF) { /* c==EOF */
5239 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
5240 j = mimeout_state.count;
5241 mimeout_state.count = 0;
5243 if (mimeout_mode > 0) {
5244 if (!nkf_isblank(mimeout_state.buf[j-1])) {
5246 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
5249 mimeout_addchar(mimeout_state.buf[i]);
5253 mimeout_addchar(mimeout_state.buf[i]);
5257 mimeout_addchar(mimeout_state.buf[i]);
5263 mimeout_addchar(mimeout_state.buf[i]);
/* lastchar is the most recently buffered byte, when there is one. */
5269 if (mimeout_state.count > 0){
5270 lastchar = mimeout_state.buf[mimeout_state.count - 1];
5275 if (mimeout_mode=='Q') {
5276 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5277 if (c == CR || c == LF) {
5282 } else if (c <= SP) {
5284 if (base64_count > 70) {
5285 put_newline(o_mputc);
5288 if (!nkf_isblank(c)) {
5293 if (base64_count > 70) {
5295 put_newline(o_mputc);
5298 open_mime(output_mode);
5300 if (!nkf_noescape_mime(c)) {
/* Not currently inside an encoded section. */
5313 if (mimeout_mode <= 0) {
5314 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5315 output_mode == UTF_8)) {
5316 if (nkf_isspace(c)) {
5318 if (mimeout_mode == -1) {
5321 if (c==CR || c==LF) {
5323 open_mime(output_mode);
5329 for (i=0;i<mimeout_state.count;i++) {
5330 (*o_mputc)(mimeout_state.buf[i]);
5331 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
5342 mimeout_state.buf[0] = (char)c;
5343 mimeout_state.count = 1;
/* The pending word would push the line past 76: look for `boundary="` in
 * the buffer to decide where the line may be broken. */
5345 if (base64_count > 1
5346 && base64_count + mimeout_state.count > 76
5347 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
5348 static const char *str = "boundary=\"";
5349 static int len = 10;
5352 for (; i < mimeout_state.count - len; ++i) {
5353 if (!strncmp((char *)(mimeout_state.buf+i), str, len)) {
5359 if (i == 0 || i == mimeout_state.count - len) {
5360 put_newline(o_mputc);
5362 if (!nkf_isspace(mimeout_state.buf[0])){
5369 for (j = 0; j <= i; ++j) {
5370 (*o_mputc)(mimeout_state.buf[j]);
5372 put_newline(o_mputc);
5374 for (; j <= mimeout_state.count; ++j) {
5375 mimeout_state.buf[j - i] = mimeout_state.buf[j];
5377 mimeout_state.count -= i;
5380 mimeout_state.buf[mimeout_state.count++] = (char)c;
5381 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5382 open_mime(output_mode);
5387 if (lastchar==CR || lastchar == LF){
5388 for (i=0;i<mimeout_state.count;i++) {
5389 (*o_mputc)(mimeout_state.buf[i]);
5392 mimeout_state.count = 0;
5395 for (i=0;i<mimeout_state.count-1;i++) {
5396 (*o_mputc)(mimeout_state.buf[i]);
5399 mimeout_state.buf[0] = SP;
5400 mimeout_state.count = 1;
5402 open_mime(output_mode);
5405 /* mimeout_mode == 'B', 1, 2 */
5406 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5407 output_mode == UTF_8)) {
5408 if (lastchar == CR || lastchar == LF){
5409 if (nkf_isblank(c)) {
5410 for (i=0;i<mimeout_state.count;i++) {
5411 mimeout_addchar(mimeout_state.buf[i]);
5413 mimeout_state.count = 0;
5416 for (i=0;i<mimeout_state.count;i++) {
5417 (*o_mputc)(mimeout_state.buf[i]);
5420 mimeout_state.count = 0;
5422 mimeout_state.buf[mimeout_state.count++] = (char)c;
5425 if (nkf_isspace(c)) {
5426 for (i=0;i<mimeout_state.count;i++) {
5427 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
5429 for (i=0;i<mimeout_state.count;i++) {
5430 (*o_mputc)(mimeout_state.buf[i]);
5433 mimeout_state.count = 0;
5436 mimeout_state.buf[mimeout_state.count++] = (char)c;
5437 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
/* Pending buffer is full: write it out.  The bound must be checked on j,
 * the index this loop advances; testing i here would leave the loop
 * condition constant, so the loop would either be skipped or run past the
 * end of buf. */
5439 for (j=0;j<mimeout_state.count;j++) {
5440 (*o_mputc)(mimeout_state.buf[j]);
5443 mimeout_state.count = 0;
5447 if (mimeout_state.count>0 && SP<c && c!='=') {
5448 mimeout_state.buf[mimeout_state.count++] = (char)c;
5449 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5450 j = mimeout_state.count;
5451 mimeout_state.count = 0;
5453 mimeout_addchar(mimeout_state.buf[i]);
5460 if (mimeout_state.count>0) {
5461 j = mimeout_state.count;
5462 mimeout_state.count = 0;
5464 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5466 mimeout_addchar(mimeout_state.buf[i]);
5472 (*o_mputc)(mimeout_state.buf[i]);
5474 open_mime(output_mode);
/*
 * base64_conv: output-chain stage installed by module_connection() in front
 * of o_base64conv.  It runs the line-length check in mime_prechar() for the
 * character (c2, c1) and then forwards the character unchanged.
 */
5481 base64_conv(nkf_char c2, nkf_char c1)
5483 mime_prechar(c2, c1);
5484 (*o_base64conv)(c2,c1);
/*
 * Converter state for the iconv-based path.  The members visible in this
 * excerpt are the input buffer size, the output buffer and its size; the
 * functions below also use `cd` and `input_buffer`.
 */
5488 typedef struct nkf_iconv_t {
5491 size_t input_buffer_size;
5492 char *output_buffer;
5493 size_t output_buffer_size;
/*
 * nkf_iconv_new: set up a converter from encoding `fromcode` to `tocode`.
 *
 * Sets the input buffer size to IOBUF_SIZE and the output buffer size to
 * twice that, allocates both buffers with nkf_xmalloc(), and opens the
 * conversion descriptor with iconv_open(tocode, fromcode).  A descriptor of
 * (iconv_t)-1 is treated as failure and reported through perror().
 *
 * NOTE(review): `converter` is declared as an nkf_iconv_t object but is
 * accessed with `->`, and one perror() call is given the result of
 * fprintf(), whose first argument here is a format string rather than a
 * stream.  This looks like code that is not currently compiled -- confirm
 * before enabling it.
 */
5497 nkf_iconv_new(char *tocode, char *fromcode)
5499 nkf_iconv_t converter;
5501 converter->input_buffer_size = IOBUF_SIZE;
5502 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5503 converter->output_buffer_size = IOBUF_SIZE * 2;
5504 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5505 converter->cd = iconv_open(tocode, fromcode);
5506 if (converter->cd == (iconv_t)-1)
5510 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5513 perror("can't iconv_open");
/*
 * nkf_iconv_convert: read bytes through i_getc into the converter's input
 * buffer, run iconv() on them, and write converted bytes through o_putc.
 *
 * After a failed iconv() call the visible code moves unconsumed input back
 * to the start of the input buffer, and on another path doubles
 * output_buffer_size and reallocates the output buffer.
 *
 * NOTE(review): the read loop calls (*i_getc)(f) although the parameter is
 * named `input` and no `f` is declared in the visible lines; the realloc()
 * call reads `converter->outbuf` while the rest of the function uses
 * `output_buffer`; and the read loop breaks while the buffer still has room
 * (`<`), which looks inverted.  Presumably this code is not currently
 * compiled -- confirm before enabling it.
 */
5519 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5521 size_t invalid = (size_t)0;
5522 char *input_buffer = converter->input_buffer;
5523 size_t input_length = (size_t)0;
5524 char *output_buffer = converter->output_buffer;
5525 size_t output_length = converter->output_buffer_size;
5530 while ((c = (*i_getc)(f)) != EOF) {
5531 input_buffer[input_length++] = c;
5532 if (input_length < converter->input_buffer_size) break;
5536 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5537 while (output_length-- > 0) {
5538 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5540 if (ret == (size_t) - 1) {
/* Keep the bytes iconv() did not consume for the next round. */
5543 if (input_buffer != converter->input_buffer)
5544 memmove(converter->input_buffer, input_buffer, input_length);
/* Grow the output buffer to twice its size. */
5547 converter->output_buffer_size *= 2;
5548 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5549 if (output_buffer == NULL) {
5550 perror("can't realloc");
5553 converter->output_buffer = output_buffer;
5556 perror("can't iconv");
/*
 * nkf_iconv_close: release a converter created by nkf_iconv_new().
 *
 * Frees the input and output buffers with nkf_xfree() and closes the iconv
 * descriptor.  The parameter is named `converter` so that it matches the
 * identifier used in the body, and the buffers are referred to by the
 * member names used when they are allocated (input_buffer / output_buffer).
 */
5569 nkf_iconv_close(nkf_iconv_t *converter)
5571 nkf_xfree(converter->input_buffer);
5572 nkf_xfree(converter->output_buffer);
5573 iconv_close(converter->cd);
5582 struct input_code *p = input_code_list;
5594 mime_f = MIME_DECODE_DEFAULT;
5595 mime_decode_f = FALSE;
5600 x0201_f = NKF_UNSPECIFIED;
5601 iso2022jp_f = FALSE;
5602 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5603 ms_ucs_map_f = UCS_MAP_ASCII;
5605 #ifdef UTF8_INPUT_ENABLE
5606 no_cp932ext_f = FALSE;
5607 no_best_fit_chars_f = FALSE;
5608 encode_fallback = NULL;
5609 unicode_subchar = '?';
5610 input_endian = ENDIAN_BIG;
5612 #ifdef UTF8_OUTPUT_ENABLE
5613 output_bom_f = FALSE;
5614 output_endian = ENDIAN_BIG;
5616 #ifdef UNICODE_NORMALIZATION
5632 #ifdef SHIFTJIS_CP932
5642 for (i = 0; i < 256; i++){
5643 prefix_table[i] = 0;
5647 mimeout_state.count = 0;
5652 fold_preserve_f = FALSE;
5655 kanji_intro = DEFAULT_J;
5656 ascii_intro = DEFAULT_R;
5657 fold_margin = FOLD_MARGIN;
5658 o_zconv = no_connection;
5659 o_fconv = no_connection;
5660 o_eol_conv = no_connection;
5661 o_rot_conv = no_connection;
5662 o_hira_conv = no_connection;
5663 o_base64conv = no_connection;
5664 o_iso2022jp_check_conv = no_connection;
5667 i_ungetc = std_ungetc;
5669 i_bungetc = std_ungetc;
5672 i_mungetc = std_ungetc;
5673 i_mgetc_buf = std_getc;
5674 i_mungetc_buf = std_ungetc;
5675 output_mode = ASCII;
5677 mime_decode_mode = FALSE;
5683 z_prev2=0,z_prev1=0;
5685 iconv_for_check = 0;
5687 input_codename = NULL;
5688 input_encoding = NULL;
5689 output_encoding = NULL;
/*
 * module_connection: wire up the conversion pipeline from the current
 * option flags.
 *
 * Output side: resolves output_encoding (falling back to
 * nkf_default_encoding(), or to ISO-2022-JP when noout_f or guess_f is
 * set), takes the base output converter from nkf_enc_to_oconv(), and then
 * inserts optional stages by saving the current `oconv` in the stage's
 * o_* pointer and replacing `oconv` with the stage function.
 *
 * Input side: inserts optional reader stages in the same way, saving the
 * current i_getc / i_ungetc in the stage's own pointers before replacing
 * them.
 *
 * Finally selects the input converter with set_iconv().  kanji_convert()
 * treats a negative return value as "no output encoding given".
 */
5697 module_connection(void)
5699 if (input_encoding) set_input_encoding(input_encoding);
5700 if (!output_encoding) {
5701 output_encoding = nkf_default_encoding();
5703 if (!output_encoding) {
5704 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5707 set_output_encoding(output_encoding);
5708 oconv = nkf_enc_to_oconv(output_encoding);
5710 if (nkf_enc_unicode_p(output_encoding))
5711 output_mode = UTF_8;
/* An unspecified x0201_f takes the build-time default. */
5713 if (x0201_f == NKF_UNSPECIFIED) {
5714 x0201_f = X0201_DEFAULT;
5717 /* replace continuation module, from output side */
5719 /* output redirection */
5721 if (noout_f || guess_f){
5728 if (mimeout_f == TRUE) {
5729 o_base64conv = oconv; oconv = base64_conv;
5731 /* base64_count = 0; */
/* Each stage below is chained in front of whatever `oconv` currently is. */
5734 if (eolmode_f || guess_f) {
5735 o_eol_conv = oconv; oconv = eol_conv;
5738 o_rot_conv = oconv; oconv = rot_conv;
5741 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5744 o_hira_conv = oconv; oconv = hira_conv;
5747 o_fconv = oconv; oconv = fold_conv;
5750 if (alpha_f || x0201_f) {
5751 o_zconv = oconv; oconv = z_conv;
5755 i_ungetc = std_ungetc;
5756 /* input redirection */
5759 i_cgetc = i_getc; i_getc = cap_getc;
5760 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5763 i_ugetc = i_getc; i_getc = url_getc;
5764 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5767 #ifdef NUMCHAR_OPTION
5769 i_ngetc = i_getc; i_getc = numchar_getc;
5770 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5773 #ifdef UNICODE_NORMALIZATION
5775 i_nfc_getc = i_getc; i_getc = nfc_getc;
5776 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5779 if (mime_f && mimebuf_f==FIXED_MIME) {
5780 i_mgetc = i_getc; i_getc = mime_getc;
5781 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5784 i_bgetc = i_getc; i_getc = broken_getc;
5785 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* A known input encoding fixes the input converter; otherwise e_iconv is
 * installed with FALSE as the first argument. */
5787 if (input_encoding) {
5788 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5790 set_iconv(FALSE, e_iconv);
5794 struct input_code *p = input_code_list;
5803 Conversion main loop. Code detection only.
5806 #if !defined(PERL_XS) && !defined(WIN32DLL)
5813 module_connection();
5814 while ((c = (*i_getc)(f)) != EOF)
/*
 * Loop-control shorthands for the conversion loop (LAST is visible in use
 * inside kanji_convert).  NEXT, SKIP and MORE all `continue`; SKIP first
 * clears c2 and MORE first copies c1 into c2.  SEND expands to a no-op
 * expression and LAST to `break`.
 */
5821 #define NEXT continue /* no output, get next */
5822 #define SKIP c2=0;continue /* no output, get next */
5823 #define MORE c2=c1;continue /* need one more byte */
5824 #define SEND (void)0 /* output c1 and c2, get next */
5825 #define LAST break /* end of loop, go closing */
/*
 * set_input_mode(mode): assign input_mode.  The visible continuation lines
 * also record and debug-log the code name "ISO-2022-JP"; the condition
 * guarding them is not visible in this excerpt.
 */
5826 #define set_input_mode(mode) do { \
5827 input_mode = mode; \
5829 set_input_codename("ISO-2022-JP"); \
5830 debug("ISO-2022-JP"); \
/*
 * kanji_convert: main conversion loop for one input stream `f`.
 *
 * Calls module_connection() to build the pipeline; if that fails it prints
 * "no output encoding given" to stderr (unless built as PERL_XS or
 * WIN32DLL).  Input is then read through i_getc in one of three ways:
 *   - iconv == w_iconv32: four bytes at a time, reading a further four
 *     bytes when the converter asks to wait for a combining character;
 *   - iconv == w_iconv16: two bytes at a time, reading two more when the
 *     converter needs them or waits for a combining character;
 *   - otherwise: a byte-wise loop that handles 8-bit bytes, ESC sequences
 *     that switch input_mode, SI/SO, line ends, and "=?" MIME starts, and
 *     hands completed characters to the input converter (*iconv) or
 *     directly to (*oconv).
 * At the end it calls (*iconv)(EOF, 0, 0) and, when no input code name has
 * been set, records the name of the input_code_list entry with the lowest
 * score.
 */
5834 kanji_convert(FILE *f)
5836 nkf_char c1=0, c2=0, c3=0, c4=0;
5837 int shift_mode = 0; /* 0, 1, 2, 3 */
5839 int is_8bit = FALSE;
5841 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5846 output_mode = ASCII;
5848 if (module_connection() < 0) {
5849 #if !defined(PERL_XS) && !defined(WIN32DLL)
5850 fprintf(stderr, "no output encoding given\n");
5856 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: consume one 4-byte unit per iteration. */
5857 if(iconv == w_iconv32){
5858 while ((c1 = (*i_getc)(f)) != EOF &&
5859 (c2 = (*i_getc)(f)) != EOF &&
5860 (c3 = (*i_getc)(f)) != EOF &&
5861 (c4 = (*i_getc)(f)) != EOF) {
5862 nkf_char c5, c6, c7, c8;
5863 if (nkf_iconv_utf_32(c1, c2, c3, c4) == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5864 if ((c5 = (*i_getc)(f)) != EOF &&
5865 (c6 = (*i_getc)(f)) != EOF &&
5866 (c7 = (*i_getc)(f)) != EOF &&
5867 (c8 = (*i_getc)(f)) != EOF) {
5868 if (nkf_iconv_utf_32_combine(c1, c2, c3, c4, c5, c6, c7, c8)) {
5873 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
5876 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
/* UTF-16 input: consume one 2-byte unit per iteration. */
5882 else if (iconv == w_iconv16) {
5883 while ((c1 = (*i_getc)(f)) != EOF &&
5884 (c2 = (*i_getc)(f)) != EOF) {
5885 size_t ret = nkf_iconv_utf_16(c1, c2, 0, 0);
5886 if (ret == NKF_ICONV_NEED_TWO_MORE_BYTES &&
5887 (c3 = (*i_getc)(f)) != EOF &&
5888 (c4 = (*i_getc)(f)) != EOF) {
5889 nkf_iconv_utf_16(c1, c2, c3, c4);
5890 } else if (ret == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5891 if ((c3 = (*i_getc)(f)) != EOF &&
5892 (c4 = (*i_getc)(f)) != EOF) {
5893 if (nkf_iconv_utf_16_combine(c1, c2, c3, c4)) {
5896 nkf_iconv_utf_16_nocombine(c1, c2);
5899 nkf_iconv_utf_16_nocombine(c1, c2);
/* Byte-wise loop for all other inputs. */
5907 while ((c1 = (*i_getc)(f)) != EOF) {
5908 #ifdef INPUT_CODE_FIX
5909 if (!input_encoding)
5914 if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
5915 /* in case of 8th bit is on */
5916 if (!estab_f&&!mime_decode_mode) {
5917 /* in case of not established yet */
5918 /* It is still ambiguous */
5919 if (h_conv(f, c2, c1)==EOF) {
5927 /* in case of already established */
5929 /* ignore bogus code */
5937 /* 2nd byte of 7 bit code or SJIS */
5941 else if (nkf_char_unicode_p(c1)) {
5947 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5950 }else if (input_codename && input_codename[0] == 'I' &&
5951 0xA1 <= c1 && c1 <= 0xDF) {
5952 /* JIS X 0201 Katakana in 8bit JIS */
5953 c2 = JIS_X_0201_1976_K;
5956 } else if (c1 > DEL) {
5958 if (!estab_f && !iso8859_f) {
5959 /* not established yet */
5961 } else { /* estab_f==TRUE */
5967 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5968 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5970 c2 = JIS_X_0201_1976_K;
5975 /* already established */
5979 } else if (SP < c1 && c1 < DEL) {
5980 /* in case of Roman characters */
5982 /* output 1 shifted byte */
5986 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5987 /* output 1 shifted byte */
5988 c2 = JIS_X_0201_1976_K;
5991 /* look like bogus code */
5994 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5995 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5996 /* in case of Kanji shifted */
5998 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5999 /* Check MIME code */
6000 if ((c1 = (*i_getc)(f)) == EOF) {
6003 } else if (c1 == '?') {
6004 /* =? is mime conversion start sequence */
6005 if(mime_f == STRICT_MIME) {
6006 /* check in real detail */
6007 if (mime_begin_strict(f) == EOF)
6010 } else if (mime_begin(f) == EOF)
6019 /* normal ASCII code */
6022 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
6025 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
6028 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
6029 if ((c1 = (*i_getc)(f)) == EOF) {
6033 else if (c1 == '&') {
6035 if ((c1 = (*i_getc)(f)) == EOF) {
6041 else if (c1 == '$') {
6043 if ((c1 = (*i_getc)(f)) == EOF) {
6044 /* don't send bogus code
6046 (*oconv)(0, '$'); */
6048 } else if (c1 == '@' || c1 == 'B') {
6050 set_input_mode(JIS_X_0208);
6052 } else if (c1 == '(') {
6054 if ((c1 = (*i_getc)(f)) == EOF) {
6055 /* don't send bogus code
6061 } else if (c1 == '@'|| c1 == 'B') {
6063 set_input_mode(JIS_X_0208);
6066 } else if (c1 == 'D'){
6067 set_input_mode(JIS_X_0212);
6069 #endif /* X0212_ENABLE */
6070 } else if (c1 == 'O' || c1 == 'Q'){
6071 set_input_mode(JIS_X_0213_1);
6073 } else if (c1 == 'P'){
6074 set_input_mode(JIS_X_0213_2);
6077 /* could be some special code */
6084 } else if (broken_f&0x2) {
6085 /* accept any ESC-(-x as broken code ... */
6086 input_mode = JIS_X_0208;
6095 } else if (c1 == '(') {
6097 if ((c1 = (*i_getc)(f)) == EOF) {
6098 /* don't send bogus code
6100 (*oconv)(0, '('); */
6103 else if (c1 == 'I') {
6104 /* JIS X 0201 Katakana */
6105 set_input_mode(JIS_X_0201_1976_K);
6109 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
6110 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
6111 set_input_mode(ASCII);
6114 else if (broken_f&0x2) {
6115 set_input_mode(ASCII);
6124 else if (c1 == '.') {
6126 if ((c1 = (*i_getc)(f)) == EOF) {
6129 else if (c1 == 'A') {
6140 else if (c1 == 'N') {
6143 if (g2 == ISO_8859_1) {
6159 } else if (c1 == ESC && iconv == s_iconv) {
6160 /* ESC in Shift_JIS */
6161 if ((c1 = (*i_getc)(f)) == EOF) {
6164 } else if (c1 == '$') {
6166 if ((c1 = (*i_getc)(f)) == EOF) {
6168 } else if (('E' <= c1 && c1 <= 'G') ||
6169 ('O' <= c1 && c1 <= 'Q')) {
6177 static const nkf_char jphone_emoji_first_table[7] =
6178 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
6179 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
6180 if ((c1 = (*i_getc)(f)) == EOF) LAST;
6181 while (SP <= c1 && c1 <= 'z') {
6182 (*oconv)(0, c1 + c3);
6183 if ((c1 = (*i_getc)(f)) == EOF) LAST;
6199 } else if (c1 == LF || c1 == CR) {
6201 input_mode = ASCII; set_iconv(FALSE, 0);
6203 } else if (mime_decode_f && !mime_decode_mode){
6205 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
6213 } else { /* if (c1 == CR)*/
6214 if ((c1=(*i_getc)(f))!=EOF) {
6218 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
/* Hand the pair to the input converter; its return value selects how many
 * further bytes are read below. */
6238 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
6241 if ((c3 = (*i_getc)(f)) != EOF) {
6244 if ((c4 = (*i_getc)(f)) != EOF) {
6246 (*iconv)(c2, c1, c3|c4);
6251 /* 4 bytes UTF-8 (check combining character) */
6252 if ((c3 = (*i_getc)(f)) != EOF) {
6253 if ((c4 = (*i_getc)(f)) != EOF) {
6254 if (w_iconv_combine(c2, c1, 0, c3, c4, 0)) {
6257 w_iconv_nocombine(c2, c1, 0);
6261 w_iconv_nocombine(c2, c1, 0);
6264 w_iconv_nocombine(c2, c1, 0);
6268 /* 3 bytes EUC or UTF-8 */
6269 if ((c3 = (*i_getc)(f)) != EOF) {
6271 if ((*iconv)(c2, c1, c3) == -3) {
6272 /* 6 bytes UTF-8 (check combining character) */
6274 if ((c4 = (*i_getc)(f)) != EOF) {
6275 if ((c5 = (*i_getc)(f)) != EOF) {
6276 if ((c6 = (*i_getc)(f)) != EOF) {
6277 if (w_iconv_combine(c2, c1, c3, c4, c5, c6)) {
6281 w_iconv_nocombine(c2, c1, c3);
6286 w_iconv_nocombine(c2, c1, c3);
6290 w_iconv_nocombine(c2, c1, c3);
6293 w_iconv_nocombine(c2, c1, c3);
6303 0x7F <= c2 && c2 <= 0x92 &&
6304 0x21 <= c1 && c1 <= 0x7E) {
6306 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
6309 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
6313 (*oconv)(PREFIX_EUCG3 | c2, c1);
6315 #endif /* X0212_ENABLE */
6317 (*oconv)(PREFIX_EUCG3 | c2, c1);
6320 (*oconv)(input_mode, c1); /* other special case */
6326 /* goto next_word */
/* End of input: notify the converter, then settle the input code name. */
6331 (*iconv)(EOF, 0, 0);
6332 if (!input_codename)
6335 struct input_code *p = input_code_list;
6336 struct input_code *result = p;
6338 if (p->score < result->score) result = p;
6341 set_input_codename(result->name);
6343 debug(result->name);
6351 * int options(unsigned char *cp)
/*
 * options: parse one option string and set the corresponding global flags.
 *
 * cp points at the option text.  Everything up to and including the first
 * '-' is skipped, then each following character is handled as a short
 * option; some short options consume further characters as their argument
 * (for example digits after 'f' or 'Z').  "--name" / "--name=value" long
 * options are looked up in long_option[]; an entry with a non-empty alias
 * is handled by continuing the parse inside the alias string (cp_back is
 * presumably where the original position is kept -- the assignment is not
 * visible in this excerpt).  Unknown options are reported on stderr unless
 * built as PERL_XS or WIN32DLL.
 */
6358 options(unsigned char *cp)
6362 unsigned char *cp_back = NULL;
/* Skip to just past the first '-'. */
6367 while(*cp && *cp++!='-');
6368 while (*cp || cp_back) {
6376 case '-': /* literal options */
6377 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/* Match the text after "--" against each long option name, comparing up
 * to an '=' in the table entry. */
6381 for (i=0;i<(int)(sizeof(long_option)/sizeof(long_option[0]));i++) {
6382 p = (unsigned char *)long_option[i].name;
6383 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
6384 if (*p == cp[j] || cp[j] == SP){
6391 #if !defined(PERL_XS) && !defined(WIN32DLL)
6392 fprintf(stderr, "unknown long option: --%s\n", cp);
/* Advance cp to the end of this option word. */
6396 while(*cp && *cp != SP && cp++);
6397 if (long_option[i].alias[0]){
6399 cp = (unsigned char *)long_option[i].alias;
6402 if (strcmp(long_option[i].name, "help") == 0){
6407 if (strcmp(long_option[i].name, "ic=") == 0){
6408 enc = nkf_enc_find((char *)p);
6410 input_encoding = enc;
6413 if (strcmp(long_option[i].name, "oc=") == 0){
6414 enc = nkf_enc_find((char *)p);
6415 /* if (enc <= 0) continue; */
6417 output_encoding = enc;
6420 if (strcmp(long_option[i].name, "guess=") == 0){
6421 if (p[0] == '0' || p[0] == '1') {
6429 if (strcmp(long_option[i].name, "overwrite") == 0){
6432 preserve_time_f = TRUE;
6435 if (strcmp(long_option[i].name, "overwrite=") == 0){
6438 preserve_time_f = TRUE;
6440 backup_suffix = (char *)p;
6443 if (strcmp(long_option[i].name, "in-place") == 0){
6446 preserve_time_f = FALSE;
6449 if (strcmp(long_option[i].name, "in-place=") == 0){
6452 preserve_time_f = FALSE;
6454 backup_suffix = (char *)p;
6459 if (strcmp(long_option[i].name, "cap-input") == 0){
6463 if (strcmp(long_option[i].name, "url-input") == 0){
6468 #ifdef NUMCHAR_OPTION
6469 if (strcmp(long_option[i].name, "numchar-input") == 0){
6475 if (strcmp(long_option[i].name, "no-output") == 0){
6479 if (strcmp(long_option[i].name, "debug") == 0){
6484 if (strcmp(long_option[i].name, "cp932") == 0){
6485 #ifdef SHIFTJIS_CP932
6489 #ifdef UTF8_OUTPUT_ENABLE
6490 ms_ucs_map_f = UCS_MAP_CP932;
6494 if (strcmp(long_option[i].name, "no-cp932") == 0){
6495 #ifdef SHIFTJIS_CP932
6499 #ifdef UTF8_OUTPUT_ENABLE
6500 ms_ucs_map_f = UCS_MAP_ASCII;
6504 #ifdef SHIFTJIS_CP932
6505 if (strcmp(long_option[i].name, "cp932inv") == 0){
6512 if (strcmp(long_option[i].name, "x0212") == 0){
6519 if (strcmp(long_option[i].name, "exec-in") == 0){
6523 if (strcmp(long_option[i].name, "exec-out") == 0){
6528 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
6529 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
6530 no_cp932ext_f = TRUE;
6533 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
6534 no_best_fit_chars_f = TRUE;
6537 if (strcmp(long_option[i].name, "fb-skip") == 0){
6538 encode_fallback = NULL;
6541 if (strcmp(long_option[i].name, "fb-html") == 0){
6542 encode_fallback = encode_fallback_html;
6545 if (strcmp(long_option[i].name, "fb-xml") == 0){
6546 encode_fallback = encode_fallback_xml;
6549 if (strcmp(long_option[i].name, "fb-java") == 0){
6550 encode_fallback = encode_fallback_java;
6553 if (strcmp(long_option[i].name, "fb-perl") == 0){
6554 encode_fallback = encode_fallback_perl;
6557 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6558 encode_fallback = encode_fallback_subchar;
/* --fb-subchar=N: N is read as decimal, hexadecimal (second character
 * 'x' or 'X') or octal, then passed through w16e_conv(). */
6561 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6562 encode_fallback = encode_fallback_subchar;
6563 unicode_subchar = 0;
6565 /* decimal number */
6566 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6567 unicode_subchar *= 10;
6568 unicode_subchar += hex2bin(p[i]);
6570 }else if(p[1] == 'x' || p[1] == 'X'){
6571 /* hexadecimal number */
6572 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6573 unicode_subchar <<= 4;
6574 unicode_subchar |= hex2bin(p[i]);
6578 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6579 unicode_subchar *= 8;
6580 unicode_subchar += hex2bin(p[i]);
6583 w16e_conv(unicode_subchar, &i, &j);
6584 unicode_subchar = i<<8 | j;
6588 #ifdef UTF8_OUTPUT_ENABLE
6589 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6590 ms_ucs_map_f = UCS_MAP_MS;
6594 #ifdef UNICODE_NORMALIZATION
6595 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/* --prefix=XYZ...: the first character X becomes the prefix_table entry
 * for each following graphic character. */
6600 if (strcmp(long_option[i].name, "prefix=") == 0){
6601 if (nkf_isgraph(p[0])){
6602 for (i = 1; nkf_isgraph(p[i]); i++){
6603 prefix_table[p[i]] = p[0];
6608 #if !defined(PERL_XS) && !defined(WIN32DLL)
6609 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6614 case 'b': /* buffered mode */
6617 case 'u': /* non buffered mode */
6620 case 't': /* transparent mode */
6625 } else if (*cp=='2') {
6629 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6637 case 'j': /* JIS output */
6639 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6641 case 'e': /* AT&T EUC output */
6642 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6644 case 's': /* SJIS output */
6645 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6647 case 'l': /* ISO8859 Latin-1 support, no conversion */
6648 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6649 input_encoding = nkf_enc_from_index(ISO_8859_1);
6651 case 'i': /* Kanji IN ESC-$-@/B */
6652 if (*cp=='@'||*cp=='B')
6653 kanji_intro = *cp++;
6655 case 'o': /* ASCII IN ESC-(-J/B/H */
6656 /* ESC ( H was used in initial JUNET messages */
6657 if (*cp=='J'||*cp=='B'||*cp=='H')
6658 ascii_intro = *cp++;
6662 bit:1 katakana->hiragana
6663 bit:2 hiragana->katakana
6665 if ('9'>= *cp && *cp>='0')
6666 hira_f |= (*cp++ -'0');
6673 #if defined(MSDOS) || defined(__OS2__)
6680 show_configuration();
6688 #ifdef UTF8_OUTPUT_ENABLE
6689 case 'w': /* UTF-{8,16,32} output */
6694 output_encoding = nkf_enc_from_index(UTF_8N);
6696 output_bom_f = TRUE;
6697 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6701 if ('1'== cp[0] && '6'==cp[1]) {
6704 } else if ('3'== cp[0] && '2'==cp[1]) {
6708 output_encoding = nkf_enc_from_index(UTF_8);
6713 output_endian = ENDIAN_LITTLE;
6714 output_bom_f = TRUE;
6715 } else if (cp[0] == 'B') {
6717 output_bom_f = TRUE;
6720 output_bom_f = FALSE;
6722 enc_idx = enc_idx == UTF_16
6723 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6724 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6726 enc_idx = enc_idx == UTF_16
6727 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6728 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6730 output_encoding = nkf_enc_from_index(enc_idx);
6734 #ifdef UTF8_INPUT_ENABLE
6735 case 'W': /* UTF input */
6738 input_encoding = nkf_enc_from_index(UTF_8);
6741 if ('1'== cp[0] && '6'==cp[1]) {
6743 input_endian = ENDIAN_BIG;
6745 } else if ('3'== cp[0] && '2'==cp[1]) {
6747 input_endian = ENDIAN_BIG;
6750 input_encoding = nkf_enc_from_index(UTF_8);
6755 input_endian = ENDIAN_LITTLE;
6756 } else if (cp[0] == 'B') {
6758 input_endian = ENDIAN_BIG;
6760 enc_idx = (enc_idx == UTF_16
6761 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6762 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6763 input_encoding = nkf_enc_from_index(enc_idx);
6767 /* Input code assumption */
6768 case 'J': /* ISO-2022-JP input */
6769 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6771 case 'E': /* EUC-JP input */
6772 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6774 case 'S': /* Shift_JIS input */
6775 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6777 case 'Z': /* Convert X0208 alphabet to ASCII */
6779 bit:0 Convert JIS X 0208 Alphabet to ASCII
6780 bit:1 Convert Kankaku to one space
6781 bit:2 Convert Kankaku to two spaces
6782 bit:3 Convert HTML Entity
6783 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6785 while ('0'<= *cp && *cp <='4') {
6786 alpha_f |= 1 << (*cp++ - '0');
6790 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6791 x0201_f = FALSE; /* No X0201->X0208 conversion */
6793 ESC-(-I in JIS, EUC, MS Kanji
6794 SI/SO in JIS, EUC, MS Kanji
6795 SS2 in EUC, JIS, not in MS Kanji
6796 MS Kanji (0xa0-0xdf)
6798 ESC-(-I in JIS (0x20-0x5f)
6799 SS2 in EUC (0xa0-0xdf)
6800 0xa0-0xd in MS Kanji (0xa0-0xdf)
6803 case 'X': /* Convert X0201 kana to X0208 */
6806 case 'F': /* preserve new lines */
6807 fold_preserve_f = TRUE;
6808 case 'f': /* folding -f60 or -f */
6811 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6813 fold_len += *cp++ - '0';
/* A fold length outside 1..BUFSIZ-1 falls back to the default. */
6815 if (!(0<fold_len && fold_len<BUFSIZ))
6816 fold_len = DEFAULT_FOLD;
6820 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6822 fold_margin += *cp++ - '0';
6826 case 'm': /* MIME support */
6827 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6828 if (*cp=='B'||*cp=='Q') {
6829 mime_decode_mode = *cp++;
6830 mimebuf_f = FIXED_MIME;
6831 } else if (*cp=='N') {
6832 mime_f = TRUE; cp++;
6833 } else if (*cp=='S') {
6834 mime_f = STRICT_MIME; cp++;
6835 } else if (*cp=='0') {
6836 mime_decode_f = FALSE;
6837 mime_f = FALSE; cp++;
6839 mime_f = STRICT_MIME;
6842 case 'M': /* MIME output */
6845 mimeout_f = FIXED_MIME; cp++;
6846 } else if (*cp=='Q') {
6848 mimeout_f = FIXED_MIME; cp++;
6853 case 'B': /* Broken JIS support */
6855 bit:1 allow any x on ESC-(-x or ESC-$-x
6856 bit:2 reset to ascii on NL
6858 if ('9'>= *cp && *cp>='0')
6859 broken_f |= 1<<(*cp++ -'0');
6864 case 'O':/* for Output file */
6868 case 'c':/* add cr code */
6871 case 'd':/* delete cr code */
6874 case 'I': /* ISO-2022-JP output */
6877 case 'L': /* line mode */
6878 if (*cp=='u') { /* unix */
6879 eolmode_f = LF; cp++;
6880 } else if (*cp=='m') { /* mac */
6881 eolmode_f = CR; cp++;
6882 } else if (*cp=='w') { /* windows */
6883 eolmode_f = CRLF; cp++;
6884 } else if (*cp=='0') { /* no conversion */
6885 eolmode_f = 0; cp++;
6890 if ('2' <= *cp && *cp <= '9') {
6893 } else if (*cp == '0' || *cp == '1') {
6902 /* module multiple options in a string are allowed for Perl module */
6903 while(*cp && *cp++!='-');
6906 #if !defined(PERL_XS) && !defined(WIN32DLL)
6907 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6909 /* bogus option but ignored */
6917 #include "nkf32dll.c"
6918 #elif defined(PERL_XS)
6919 #else /* WIN32DLL */
/*
 * main: command-line entry point.
 *
 * Leading arguments that start with '-' are consumed as options.  The
 * visible lines then show two paths: converting stdin with
 * kanji_convert(stdin), and a per-file path that opens each remaining
 * argument with fopen(..., "r").  When file_out_f is TRUE, stdout is
 * redirected either to a temporary file created next to the original
 * (name suffix "ntXXXXXX" or ".nkftmpXXXXXX") or to "nkf.out".  After
 * conversion the temporary file gets the original's permissions and,
 * if preserve_time_f is set, its timestamps; the original is optionally
 * renamed to a backup name, and the temporary file is renamed over the
 * original.  Failures on these steps are reported on stderr.
 */
6921 main(int argc, char **argv)
6926 char *outfname = NULL;
6929 #ifdef EASYWIN /*Easy Win */
6930 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6932 #ifdef DEFAULT_CODE_LOCALE
6933 setlocale(LC_CTYPE, "");
/* Consume leading option arguments. */
6937 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6938 cp = (unsigned char *)*argv;
6943 if (pipe(fds) < 0 || (pid = fork()) < 0){
6954 execvp(argv[1], &argv[1]);
/* These flags are saved and put back around an intervening step that is
 * not visible in this excerpt. */
6971 int debug_f_back = debug_f;
6974 int exec_f_back = exec_f;
6977 int x0212_f_back = x0212_f;
6979 int x0213_f_back = x0213_f;
6980 int guess_f_back = guess_f;
6982 guess_f = guess_f_back;
6985 debug_f = debug_f_back;
6988 exec_f = exec_f_back;
6990 x0212_f = x0212_f_back;
6991 x0213_f = x0213_f_back;
6994 if (binmode_f == TRUE)
6995 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6996 if (freopen("","wb",stdout) == NULL)
7003 setbuf(stdout, (char *) NULL);
7005 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
7008 if (binmode_f == TRUE)
7009 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7010 if (freopen("","rb",stdin) == NULL) return (-1);
7014 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
7018 kanji_convert(stdin);
7019 if (guess_f) print_guessed_code(NULL);
7023 int is_argument_error = FALSE;
7025 input_codename = NULL;
7028 iconv_for_check = 0;
7030 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
7032 is_argument_error = TRUE;
7040 /* reopen file for stdout */
7041 if (file_out_f == TRUE) {
/* Build the temporary file name from the original name plus a suffix. */
7044 outfname = nkf_xmalloc(strlen(origfname)
7045 + strlen(".nkftmpXXXXXX")
7047 strcpy(outfname, origfname);
/* Scan backwards for the last path separator ('/' or '\\'). */
7051 for (i = strlen(outfname); i; --i){
7052 if (outfname[i - 1] == '/'
7053 || outfname[i - 1] == '\\'){
7059 strcat(outfname, "ntXXXXXX");
7061 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
7062 S_IREAD | S_IWRITE);
7064 strcat(outfname, ".nkftmpXXXXXX");
7065 fd = mkstemp(outfname);
7068 || (fd_backup = dup(fileno(stdout))) < 0
7069 || dup2(fd, fileno(stdout)) < 0
7080 outfname = "nkf.out";
7083 if(freopen(outfname, "w", stdout) == NULL) {
7087 if (binmode_f == TRUE) {
7088 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7089 if (freopen("","wb",stdout) == NULL)
7096 if (binmode_f == TRUE)
7097 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7098 if (freopen("","rb",fin) == NULL)
7103 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
7107 char *filename = NULL;
/* The file name is passed to print_guessed_code() only when several files
 * are being processed. */
7109 if (nfiles > 1) filename = origfname;
7110 if (guess_f) print_guessed_code(filename);
7116 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* Put the saved stdout descriptor back. */
7124 if (dup2(fd_backup, fileno(stdout)) < 0){
7127 if (stat(origfname, &sb)) {
7128 fprintf(stderr, "Can't stat %s\n", origfname);
7130 /* Restore the permissions */
7131 if (chmod(outfname, sb.st_mode)) {
7132 fprintf(stderr, "Can't set permission %s\n", outfname);
7135 /* Restore the timestamp */
7136 if(preserve_time_f){
7137 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
7138 tb[0] = tb[1] = sb.st_mtime;
7139 if (utime(outfname, tb)) {
7140 fprintf(stderr, "Can't set timestamp %s\n", outfname);
7143 tb.actime = sb.st_atime;
7144 tb.modtime = sb.st_mtime;
7145 if (utime(outfname, &tb)) {
7146 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* Move the original aside under the backup name, replacing any earlier
 * backup. */
7151 char *backup_filename = get_backup_filename(backup_suffix, origfname);
7153 unlink(backup_filename);
7155 if (rename(origfname, backup_filename)) {
7156 perror(backup_filename);
7157 fprintf(stderr, "Can't rename %s to %s\n",
7158 origfname, backup_filename);
7160 nkf_xfree(backup_filename);
7163 if (unlink(origfname)){
/* Put the converted file in place of the original. */
7168 if (rename(outfname, origfname)) {
7170 fprintf(stderr, "Can't rename %s to %s\n",
7171 outfname, origfname);
7173 nkf_xfree(outfname);
7178 if (is_argument_error)
7181 #ifdef EASYWIN /*Easy Win */
7182 if (file_out_f == FALSE)
7183 scanf("%d",&end_check);
7186 #else /* for Other OS */
7187 if (file_out_f == TRUE)
7189 #endif /*Easy Win */
7192 #endif /* WIN32DLL */