2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2013, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.1.4"
24 #define NKF_RELEASE_DATE "2015-12-12"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2015, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
/*
 * The native (base) encodings.  Each initializer supplies a label string,
 * an input converter and an output converter (the iconv/oconv members of
 * nkf_native_encoding).
 * Note that ASCII and EUC-JP share e_iconv/e_oconv, and that ISO-2022-JP
 * pairs e_iconv on input with j_oconv on output.
 */
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
215 {"ISO-2022-JP", ISO_2022_JP},
216 {"ISO2022JP-CP932", CP50220},
217 {"CP50220", CP50220},
218 {"CP50221", CP50221},
219 {"CSISO2022JP", CP50221},
220 {"CP50222", CP50222},
221 {"ISO-2022-JP-1", ISO_2022_JP_1},
222 {"ISO-2022-JP-3", ISO_2022_JP_3},
223 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
224 {"SHIFT_JIS", SHIFT_JIS},
226 {"MS_Kanji", SHIFT_JIS},
228 {"WINDOWS-31J", WINDOWS_31J},
229 {"CSWINDOWS31J", WINDOWS_31J},
230 {"CP932", WINDOWS_31J},
231 {"MS932", WINDOWS_31J},
232 {"CP10001", CP10001},
235 {"EUCJP-NKF", EUCJP_NKF},
236 {"CP51932", CP51932},
237 {"EUC-JP-MS", EUCJP_MS},
238 {"EUCJP-MS", EUCJP_MS},
239 {"EUCJPMS", EUCJP_MS},
240 {"EUC-JP-ASCII", EUCJP_ASCII},
241 {"EUCJP-ASCII", EUCJP_ASCII},
242 {"SHIFT_JISX0213", SHIFT_JISX0213},
243 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
244 {"EUC-JISX0213", EUC_JISX0213},
245 {"EUC-JIS-2004", EUC_JIS_2004},
248 {"UTF-8-BOM", UTF_8_BOM},
249 {"UTF8-MAC", UTF8_MAC},
250 {"UTF-8-MAC", UTF8_MAC},
252 {"UTF-16BE", UTF_16BE},
253 {"UTF-16BE-BOM", UTF_16BE_BOM},
254 {"UTF-16LE", UTF_16LE},
255 {"UTF-16LE-BOM", UTF_16LE_BOM},
257 {"UTF-32BE", UTF_32BE},
258 {"UTF-32BE-BOM", UTF_32BE_BOM},
259 {"UTF-32LE", UTF_32LE},
260 {"UTF-32LE-BOM", UTF_32LE_BOM},
265 #if defined(DEFAULT_CODE_JIS)
266 #define DEFAULT_ENCIDX ISO_2022_JP
267 #elif defined(DEFAULT_CODE_SJIS)
268 #define DEFAULT_ENCIDX SHIFT_JIS
269 #elif defined(DEFAULT_CODE_WINDOWS_31J)
270 #define DEFAULT_ENCIDX WINDOWS_31J
271 #elif defined(DEFAULT_CODE_EUC)
272 #define DEFAULT_ENCIDX EUC_JP
273 #elif defined(DEFAULT_CODE_UTF8)
274 #define DEFAULT_ENCIDX UTF_8
/*
 * Locale-independent, ASCII-only character classification helpers.
 *
 * Every macro parameter is parenthesized so that an expression argument
 * (e.g. `x & 0x7f`) is classified as a whole instead of being torn apart
 * by operator precedence.  The arguments are still evaluated more than
 * once, so never pass an expression with side effects (`*p++`, `getc()`).
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit; any non-hex character yields 0. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
		    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
		    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the second-lowest byte of c2 (taken as unsigned short) is SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* True for CR, LF, and the characters strictly between SP and DEL other
 * than  ? = _ ( ) . and the double quote (0x22). */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) <= 0x5F)
304 #define HOLD_SIZE 1024
305 #if defined(INT_IS_SHORT)
306 #define IOBUF_SIZE 2048
308 #define IOBUF_SIZE 16384
311 #define DEFAULT_J 'B'
312 #define DEFAULT_R 'B'
319 /* MIME preprocessor */
321 #ifdef EASYWIN /*Easy Win */
322 extern POINT _BufferSize;
331 void (*status_func)(struct input_code *, nkf_char);
332 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
336 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
337 static nkf_encoding *input_encoding = NULL;
338 static nkf_encoding *output_encoding = NULL;
340 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
342 * 0: Shift_JIS, eucJP-ascii
347 #define UCS_MAP_ASCII 0
349 #define UCS_MAP_CP932 2
350 #define UCS_MAP_CP10001 3
351 static int ms_ucs_map_f = UCS_MAP_ASCII;
353 #ifdef UTF8_INPUT_ENABLE
354 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
355 static int no_cp932ext_f = FALSE;
356 /* ignore ZERO WIDTH NO-BREAK SPACE */
357 static int no_best_fit_chars_f = FALSE;
358 static int input_endian = ENDIAN_BIG;
359 static int input_bom_f = FALSE;
360 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
361 static void (*encode_fallback)(nkf_char c) = NULL;
362 static void w_status(struct input_code *, nkf_char);
364 #ifdef UTF8_OUTPUT_ENABLE
365 static int output_bom_f = FALSE;
366 static int output_endian = ENDIAN_BIG;
369 static void std_putc(nkf_char c);
370 static nkf_char std_getc(FILE *f);
371 static nkf_char std_ungetc(nkf_char c,FILE *f);
373 static nkf_char broken_getc(FILE *f);
374 static nkf_char broken_ungetc(nkf_char c,FILE *f);
376 static nkf_char mime_getc(FILE *f);
378 static void mime_putc(nkf_char c);
382 #if !defined(PERL_XS) && !defined(WIN32DLL)
383 static unsigned char stdibuf[IOBUF_SIZE];
384 static unsigned char stdobuf[IOBUF_SIZE];
387 #define NKF_UNSPECIFIED (-TRUE)
/*
 * File-scope conversion flags and their initial values.  The trailing
 * comments are the original authors' one-line descriptions.
 */
390 static int unbuf_f = FALSE;
391 static int estab_f = FALSE;
392 static int nop_f = FALSE;
393 static int binmode_f = TRUE; /* binary mode */
394 static int rot_f = FALSE; /* rot14/43 mode (NOTE(review): the nkf manual calls this ROT13/47 -- confirm) */
395 static int hira_f = FALSE; /* hira/kata henkan (hiragana/katakana conversion) */
396 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
397 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
398 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
399 static int mimebuf_f = FALSE; /* MIME buffered input */
400 static int broken_f = FALSE; /* convert ESC-less broken JIS */
401 static int iso8859_f = FALSE; /* ISO8859 through */
402 static int mimeout_f = FALSE; /* base64 mode */
403 static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */
404 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
406 #ifdef UNICODE_NORMALIZATION
407 static int nfc_f = FALSE;
408 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
409 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
413 static int cap_f = FALSE;
414 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
415 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
417 static int url_f = FALSE;
418 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
419 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * nkf_char tagging helpers.
 *
 * The top byte of an nkf_char (CLASS_MASK) carries a class tag and the low
 * 24 bits (VALUE_MASK) carry the value; CLASS_UNICODE marks a value that is
 * a Unicode code point.  PREFIX_EUCG3 is OR-ed into a value by
 * nkf_char_euc3_new().
 *
 * All macro parameters are parenthesized so that expression arguments such
 * as `a | b` are masked as a whole.
 */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX	NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)

/*
 * Combine a UTF-16 surrogate pair into a code point.  The constant equals
 * (0xD800 << 10) + 0xDC00 - 0x10000, i.e. it removes both surrogate bases
 * and adds the 0x10000 supplementary-plane offset in one subtraction.
 */
#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
436 #ifdef NUMCHAR_OPTION
437 static int numchar_f = FALSE;
438 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
439 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
443 static int noout_f = FALSE;
444 static void no_putc(nkf_char c);
445 static int debug_f = FALSE;
446 static void debug(const char *str);
447 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
450 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
451 static void set_input_codename(const char *codename);
454 static int exec_f = 0;
457 #ifdef SHIFTJIS_CP932
458 /* invert IBM extended characters to others */
459 static int cp51932_f = FALSE;
461 /* invert NEC-selected IBM extended characters to IBM extended characters */
462 static int cp932inv_f = TRUE;
464 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
465 #endif /* SHIFTJIS_CP932 */
467 static int x0212_f = FALSE;
468 static int x0213_f = FALSE;
470 static unsigned char prefix_table[256];
472 static void e_status(struct input_code *, nkf_char);
473 static void s_status(struct input_code *, nkf_char);
475 struct input_code input_code_list[] = {
476 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
477 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
478 #ifdef UTF8_INPUT_ENABLE
479 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
480 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
481 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
483 {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0}
486 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
487 static int base64_count = 0;
489 /* X0208 -> ASCII converter */
492 static int f_line = 0; /* chars in line */
493 static int f_prev = 0;
494 static int fold_preserve_f = FALSE; /* preserve new lines */
495 static int fold_f = FALSE;
496 static int fold_len = 0;
499 static unsigned char kanji_intro = DEFAULT_J;
500 static unsigned char ascii_intro = DEFAULT_R;
504 #define FOLD_MARGIN 10
505 #define DEFAULT_FOLD 60
507 static int fold_margin = FOLD_MARGIN;
509 /* process default */
512 no_connection2(ARG_UNUSED nkf_char c2, ARG_UNUSED nkf_char c1, ARG_UNUSED nkf_char c0)
514 fprintf(stderr,"nkf internal module connection failure.\n");
520 no_connection(nkf_char c2, nkf_char c1)
522 no_connection2(c2,c1,0);
525 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
526 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
528 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
529 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
530 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
531 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
532 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
533 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
534 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
536 /* static redirections */
538 static void (*o_putc)(nkf_char c) = std_putc;
540 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
541 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
543 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
544 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
546 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
548 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
549 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
551 /* for strict mime */
552 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
553 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
556 static int output_mode = ASCII; /* output kanji mode */
557 static int input_mode = ASCII; /* input kanji mode */
558 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
560 /* X0201 / X0208 conversion tables */
562 /* X0201 kana conversion table */
564 static const unsigned char cv[]= {
565 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
566 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
567 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
568 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
569 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
570 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
571 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
572 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
573 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
574 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
575 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
576 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
577 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
578 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
579 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
580 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
584 /* X0201 kana conversion table for dakuten */
586 static const unsigned char dv[]= {
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
592 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
593 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
594 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
595 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
596 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
598 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 /* X0201 kana conversion table for han-dakuten */
607 static const unsigned char ev[]= {
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
618 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
619 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
626 /* X0201 kana to X0213 conversion table for han-dakuten */
628 static const unsigned char ev_x0213[]= {
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
634 0x00,0x00,0x00,0x00,0x25,0x77,0x25,0x78,
635 0x25,0x79,0x25,0x7a,0x25,0x7b,0x00,0x00,
636 0x00,0x00,0x00,0x00,0x25,0x7c,0x00,0x00,
637 0x00,0x00,0x00,0x00,0x25,0x7d,0x00,0x00,
638 0x25,0x7e,0x00,0x00,0x00,0x00,0x00,0x00,
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
648 /* X0208 kigou conversion table */
649 /* 0x8140 - 0x819e */
650 static const unsigned char fv[] = {
652 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
653 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
654 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
656 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
657 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
658 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
660 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
668 static int option_mode = 0;
669 static int file_out_f = FALSE;
671 static int overwrite_f = FALSE;
672 static int preserve_time_f = FALSE;
673 static int backup_f = FALSE;
674 static char *backup_suffix = "";
677 static int eolmode_f = 0; /* CR, LF, CRLF */
678 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
679 static nkf_char prev_cr = 0; /* CR or 0 */
680 #ifdef EASYWIN /*Easy Win */
681 static int end_check;
/*
 * nkf_xmalloc: allocation wrapper.
 * A zero-byte request is bumped to one byte, and "can't malloc" is
 * reported through perror().
 * NOTE(review): the allocation call and whatever follows perror() are not
 * visible in this excerpt -- presumably the failure path terminates the
 * program; confirm.
 */
685 nkf_xmalloc(size_t size)
689 if (size == 0) size = 1;
693 perror("can't malloc");
/*
 * nkf_xrealloc: realloc() wrapper.
 * A zero-byte request is bumped to one byte; "can't realloc" is reported
 * through perror().
 * NOTE(review): ptr is overwritten with realloc()'s result, so the original
 * block is unreachable if realloc() fails.  The code after perror() is not
 * visible here -- harmless only if that path exits; confirm.
 */
701 nkf_xrealloc(void *ptr, size_t size)
703 if (size == 0) size = 1;
705 ptr = realloc(ptr, size);
707 perror("can't realloc");
714 #define nkf_xfree(ptr) free(ptr)
/*
 * nkf_str_caseeql: case-insensitive string equality test.
 * Characters are folded with nkf_toupper(), which only maps ASCII 'a'..'z'.
 * Returns FALSE on the first differing character or when the strings have
 * different lengths.  NOTE(review): the success return is not visible in
 * this excerpt -- presumably TRUE.
 */
717 nkf_str_caseeql(const char *src, const char *target)
/* Walk both strings in lock step until either one ends. */
720 for (i = 0; src[i] && target[i]; i++) {
721 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
/* One string ended before the other: lengths differ. */
723 if (src[i] || target[i]) return FALSE;
/*
 * nkf_enc_from_index: map an encoding index to its nkf_encoding_table entry.
 * idx is range-checked against [0, NKF_ENCODING_TABLE_SIZE) first.
 * NOTE(review): the value returned for an out-of-range idx is not visible
 * in this excerpt.
 */
728 nkf_enc_from_index(int idx)
730 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
733 return &nkf_encoding_table[idx];
/*
 * nkf_enc_find_index: look an encoding name up in encoding_name_to_id_table
 * and return its id.  The comparison is case-insensitive (nkf_str_caseeql).
 * The table is scanned until an entry with a negative id is reached.
 * NOTE(review): the not-found return value is not visible in this excerpt.
 */
737 nkf_enc_find_index(const char *name)
/* Skip a leading "X-" prefix.  Only an upper-case 'X' is recognised here,
 * although the name comparison below ignores case. */
740 if (name[0] == 'X' && *(name+1) == '-') name += 2;
741 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
742 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
743 return encoding_name_to_id_table[i].id;
/*
 * nkf_enc_find: resolve an encoding name to its table entry.
 * Returns 0 (null) when nkf_enc_find_index() reports a negative index,
 * otherwise the result of nkf_enc_from_index().
 */
750 nkf_enc_find(const char *name)
753 idx = nkf_enc_find_index(name);
754 if (idx < 0) return 0;
755 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for an encoding-table entry `enc`.
 * `enc` is evaluated more than once by the predicate macros, so it must not
 * have side effects.
 */
758 #define nkf_enc_name(enc) (enc)->name
759 #define nkf_enc_to_index(enc) (enc)->id
760 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
761 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
762 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is ASCII or ISO-2022-JP. */
763 #define nkf_enc_asciicompat(enc) (\
764 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
765 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is UTF-8, UTF-16 or UTF-32. */
766 #define nkf_enc_unicode_p(enc) (\
767 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
768 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
769 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True for the three CP5022x ids (CP50220, CP50221, CP50222). */
770 #define nkf_enc_cp5022x_p(enc) (\
771 nkf_enc_to_index(enc) == CP50220 ||\
772 nkf_enc_to_index(enc) == CP50221 ||\
773 nkf_enc_to_index(enc) == CP50222)
775 #ifdef DEFAULT_CODE_LOCALE
/*
 * nkf_locale_charmap: name of the platform's current character set.
 * Visible branches:
 *   HAVE_LANGINFO_H : nl_langinfo(CODESET)
 *   __WIN32__       : "CP<n>" built from GetACP()
 *   __OS2__         : DosQueryCp(); code pages 932 and 943 are reported as
 *                     "Shift_JIS", anything else as "CP<n>"
 * NOTE(review): the declaration of buf and the return statements of the
 * non-langinfo branches are not visible in this excerpt.  Confirm that buf
 * outlives the call and is large enough for the unbounded sprintf/strcpy.
 */
777 nkf_locale_charmap(void)
779 #ifdef HAVE_LANGINFO_H
780 return nl_langinfo(CODESET);
781 #elif defined(__WIN32__)
783 sprintf(buf, "CP%d", GetACP());
785 #elif defined(__OS2__)
786 # if defined(INT_IS_SHORT)
792 ULONG ulCP[1], ulncp;
793 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
794 if (ulCP[0] == 932 || ulCP[0] == 943)
795 strcpy(buf, "Shift_JIS");
797 sprintf(buf, "CP%lu", ulCP[0]);
/*
 * nkf_locale_encoding: encoding-table entry for the locale's charmap name,
 * obtained by passing nkf_locale_charmap()'s result to nkf_enc_find().
 * enc starts as 0, so a failed lookup leaves it null.
 * NOTE(review): any guard around the lookup and the return statement are
 * not visible in this excerpt.
 */
805 nkf_locale_encoding(void)
807 nkf_encoding *enc = 0;
808 const char *encname = nkf_locale_charmap();
810 enc = nkf_enc_find(encname);
813 #endif /* DEFAULT_CODE_LOCALE */
/* nkf_utf8_encoding: the nkf_encoding_table entry at index UTF_8. */
816 nkf_utf8_encoding(void)
818 return &nkf_encoding_table[UTF_8];
/*
 * nkf_default_encoding: choose the default encoding.
 * With DEFAULT_CODE_LOCALE the locale's encoding is used; otherwise, when
 * DEFAULT_ENCIDX is defined, that table index is used.  If neither yields an
 * entry, UTF-8 is the fallback.
 */
822 nkf_default_encoding(void)
824 nkf_encoding *enc = 0;
825 #ifdef DEFAULT_CODE_LOCALE
826 enc = nkf_locale_encoding();
827 #elif defined(DEFAULT_ENCIDX)
828 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
/* Fall back to UTF-8 when no default could be determined. */
830 if (!enc) enc = nkf_utf8_encoding();
/*
 * nkf_buf_new: allocate an nkf_buf_t together with storage for `length`
 * nkf_char elements, both through nkf_xmalloc().
 * NOTE(review): initialisation of the remaining fields and the return
 * statement are not visible in this excerpt.  `length` is a signed int that
 * is multiplied into a size without a check.
 */
841 nkf_buf_new(int length)
843 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
844 buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length);
852 nkf_buf_dispose(nkf_buf_t *buf)
859 #define nkf_buf_length(buf) ((buf)->len)
860 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
/*
 * nkf_buf_at: return the element stored at `index`.
 * NOTE(review): the assertion accepts index == buf->len, which is one past
 * the last pushed element (nkf_buf_push stores at ptr[len] and then
 * increments len).  `index < buf->len` looks like the intended bound --
 * confirm against callers.  Negative indices are not rejected either.
 */
863 nkf_buf_at(nkf_buf_t *buf, int index)
865 assert(index <= buf->len);
866 return buf->ptr[index];
870 nkf_buf_clear(nkf_buf_t *buf)
/*
 * nkf_buf_push: append c at position buf->len and increment len.
 * NOTE(review): the body of the capacity check (taken when
 * capa <= len) is not visible in this excerpt.
 */
876 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
878 if (buf->capa <= buf->len) {
881 buf->ptr[buf->len++] = c;
/*
 * nkf_buf_pop: remove and return the most recently pushed element.
 * The buffer must be non-empty; this is only enforced by assert().
 */
885 nkf_buf_pop(nkf_buf_t *buf)
887 assert(!nkf_buf_empty_p(buf));
888 return buf->ptr[--buf->len];
891 /* Normalization Form C */
894 #define fprintf dllprintf
900 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
/* Help text: adjacent string literals, selected by the compile-time
 * feature macros below. */
907 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
908 #ifdef UTF8_OUTPUT_ENABLE
909 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
910 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
913 #ifdef UTF8_INPUT_ENABLE
914 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
915 " UTF option is -W[8,[16,32][B,L]]\n"
/* NOTE(review): the line below is presumably the non-UTF8_INPUT_ENABLE
 * alternative of the J/S/E/W text above, which describes these upper-case
 * flags as *input* encoding selectors; "output" here looks like a
 * copy/paste slip -- confirm and correct the string. */
917 " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
921 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
922 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
923 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
926 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
927 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
928 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
929 " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n"
932 " O Output to File (DEFAULT 'nkf.out')\n"
933 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
936 " --ic=<encoding> Specify the input encoding\n"
937 " --oc=<encoding> Specify the output encoding\n"
938 " --hiragana --katakana Hiragana/Katakana Conversion\n"
939 " --katakana-hiragana Converts each other\n"
/* "%%" is a literal percent sign: these strings are printf-style formats. */
943 " --{cap, url}-input Convert hex after ':' or '%%'\n"
945 #ifdef NUMCHAR_OPTION
946 " --numchar-input Convert Unicode Character Reference\n"
948 #ifdef UTF8_INPUT_ENABLE
949 " --fb-{skip, html, xml, perl, java, subchar}\n"
950 " Specify unassigned character's replacement\n"
955 " --in-place[=SUF] Overwrite original files\n"
956 " --overwrite[=SUF] Preserve timestamp of original files\n"
958 " -g --guess Guess the input code\n"
959 " -v --version Print the version\n"
960 " --help/-V Print this help / configuration\n"
966 show_configuration(void)
969 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
970 " Compile-time options:\n"
971 " Compiled at: " __DATE__ " " __TIME__ "\n"
974 " Default output encoding: "
975 #ifdef DEFAULT_CODE_LOCALE
976 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
977 #elif defined(DEFAULT_ENCIDX)
978 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
984 " Default output end of line: "
985 #if DEFAULT_NEWLINE == CR
987 #elif DEFAULT_NEWLINE == CRLF
993 " Decode MIME encoded string: "
994 #if MIME_DECODE_DEFAULT
1000 " Convert JIS X 0201 Katakana: "
1007 " --help, --version output: "
1008 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * get_backup_filename: build a backup file name from `suffix` and
 * `filename` in memory obtained from nkf_xmalloc().
 *
 * Two construction paths are visible:
 *   - copy `suffix`, substituting `filename` for every '*' in it;
 *   - append `suffix` to `filename`.
 * NOTE(review): the condition selecting between them is not visible in this
 * excerpt -- presumably "suffix contains at least one '*'"; confirm.
 * Nothing here frees the result, so the caller presumably owns it.
 */
1019 get_backup_filename(const char *suffix, const char *filename)
1021 char *backup_filename;
1022 int asterisk_count = 0;
1024 int filename_length = strlen(filename);
/* Count the '*' placeholders in suffix. */
1026 for(i = 0; suffix[i]; i++){
1027 if(suffix[i] == '*') asterisk_count++;
/* Each '*' occupies one byte in suffix and expands to filename_length
 * bytes, hence (filename_length - 1) extra bytes per placeholder, plus
 * one byte for the terminating NUL. */
1031 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
1032 for(i = 0, j = 0; suffix[i];){
1033 if(suffix[i] == '*'){
/* Terminate what has been written so far so that strncat() appends at
 * offset j. */
1034 backup_filename[j] = '\0';
1035 strncat(backup_filename, filename, filename_length);
1037 j += filename_length;
1039 backup_filename[j++] = suffix[i++];
1042 backup_filename[j] = '\0';
/* Plain case: filename followed by suffix. */
1044 j = filename_length + strlen(suffix);
1045 backup_filename = nkf_xmalloc(j + 1);
1046 strcpy(backup_filename, filename);
1047 strcat(backup_filename, suffix);
1048 backup_filename[j] = '\0';
1050 return backup_filename;
1054 #ifdef UTF8_INPUT_ENABLE
/*
 * nkf_each_char_to_hex: feed the hexadecimal digits of c to the callback f,
 * one call per digit as f(0, digit).  A digit is emitted when c is at least
 * 1 << shift, so leading zero digits are skipped.
 * NOTE(review): the loop that steps `shift` and the return value are not
 * visible in this excerpt.
 */
1056 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1061 if(c >= NKF_INT32_C(1)<<shift){
1063 (*f)(0, bin2hex(c>>shift));
/*
 * encode_fallback_html: write c as decimal digits through (*oconv),
 * most significant digit first; 0x30 is ASCII '0'.
 * NOTE(review): the surrounding prefix/suffix output and the guards for the
 * lower digits are not visible in this excerpt -- presumably this forms an
 * HTML decimal character reference; confirm.
 */
1074 encode_fallback_html(nkf_char c)
/* The two highest digits are only written when c is large enough. */
1079 if(c >= NKF_INT32_C(1000000))
1080 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1081 if(c >= NKF_INT32_C(100000))
1082 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1084 (*oconv)(0, 0x30+(c/10000 )%10);
1086 (*oconv)(0, 0x30+(c/1000 )%10);
1088 (*oconv)(0, 0x30+(c/100 )%10);
1090 (*oconv)(0, 0x30+(c/10 )%10);
1092 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml: write the hexadecimal digits of c through oconv via
 * nkf_each_char_to_hex().
 * NOTE(review): the text emitted around the digits is not visible in this
 * excerpt.
 */
1098 encode_fallback_xml(nkf_char c)
1103 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java: write c as upper-case hexadecimal digits through
 * (*oconv).  Values whose low 24 bits exceed the BMP get two extra leading
 * digits (bits 23..16) before the four digits for bits 15..0.
 * NOTE(review): the escape prefix written before the digits is not visible
 * in this excerpt.
 */
1109 encode_fallback_java(nkf_char c)
1113 if(!nkf_char_unicode_bmp_p(c)){
1117 (*oconv)(0, bin2hex(c>>20));
1118 (*oconv)(0, bin2hex(c>>16));
1122 (*oconv)(0, bin2hex(c>>12));
1123 (*oconv)(0, bin2hex(c>> 8));
1124 (*oconv)(0, bin2hex(c>> 4));
1125 (*oconv)(0, bin2hex(c ));
/*
 * encode_fallback_perl: write the hexadecimal digits of c through oconv via
 * nkf_each_char_to_hex().
 * NOTE(review): the text emitted around the digits is not visible in this
 * excerpt.
 */
1130 encode_fallback_perl(nkf_char c)
1135 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar: ignore the incoming character and output the
 * configured substitution character (unicode_subchar) instead, passing its
 * second-lowest byte as c2 and its lowest byte as c1 to (*oconv).
 */
1141 encode_fallback_subchar(nkf_char c)
1143 c = unicode_subchar;
1144 (*oconv)((c>>8)&0xFF, c&0xFF);
1149 static const struct {
1173 {"katakana-hiragana","h3"},
1181 #ifdef UTF8_OUTPUT_ENABLE
1191 {"fb-subchar=", ""},
1193 #ifdef UTF8_INPUT_ENABLE
1194 {"utf8-input", "W"},
1195 {"utf16-input", "W16"},
1196 {"no-cp932ext", ""},
1197 {"no-best-fit-chars",""},
1199 #ifdef UNICODE_NORMALIZATION
1200 {"utf8mac-input", ""},
1212 #ifdef NUMCHAR_OPTION
1213 {"numchar-input", ""},
1219 #ifdef SHIFTJIS_CP932
1230 set_input_encoding(nkf_encoding *enc)
1232 switch (nkf_enc_to_index(enc)) {
1238 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1240 #ifdef SHIFTJIS_CP932
1243 #ifdef UTF8_OUTPUT_ENABLE
1244 ms_ucs_map_f = UCS_MAP_CP932;
1254 case ISO_2022_JP_2004:
1261 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1262 #ifdef SHIFTJIS_CP932
1265 #ifdef UTF8_OUTPUT_ENABLE
1266 ms_ucs_map_f = UCS_MAP_CP932;
1271 #ifdef SHIFTJIS_CP932
1274 #ifdef UTF8_OUTPUT_ENABLE
1275 ms_ucs_map_f = UCS_MAP_CP10001;
1283 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1284 #ifdef SHIFTJIS_CP932
1287 #ifdef UTF8_OUTPUT_ENABLE
1288 ms_ucs_map_f = UCS_MAP_CP932;
1292 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1293 #ifdef SHIFTJIS_CP932
1296 #ifdef UTF8_OUTPUT_ENABLE
1297 ms_ucs_map_f = UCS_MAP_MS;
1301 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1302 #ifdef SHIFTJIS_CP932
1305 #ifdef UTF8_OUTPUT_ENABLE
1306 ms_ucs_map_f = UCS_MAP_ASCII;
1309 case SHIFT_JISX0213:
1310 case SHIFT_JIS_2004:
1312 #ifdef SHIFTJIS_CP932
1314 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1320 #ifdef SHIFTJIS_CP932
1324 #ifdef UTF8_INPUT_ENABLE
1325 #ifdef UNICODE_NORMALIZATION
1333 input_endian = ENDIAN_BIG;
1337 input_endian = ENDIAN_LITTLE;
1342 input_endian = ENDIAN_BIG;
1346 input_endian = ENDIAN_LITTLE;
1353 set_output_encoding(nkf_encoding *enc)
1355 switch (nkf_enc_to_index(enc)) {
1357 #ifdef SHIFTJIS_CP932
1358 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1360 #ifdef UTF8_OUTPUT_ENABLE
1361 ms_ucs_map_f = UCS_MAP_CP932;
1365 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1366 #ifdef SHIFTJIS_CP932
1367 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1369 #ifdef UTF8_OUTPUT_ENABLE
1370 ms_ucs_map_f = UCS_MAP_CP932;
1374 #ifdef SHIFTJIS_CP932
1375 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1380 #ifdef SHIFTJIS_CP932
1381 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1385 case ISO_2022_JP_2004:
1388 #ifdef SHIFTJIS_CP932
1389 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1395 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1396 #ifdef UTF8_OUTPUT_ENABLE
1397 ms_ucs_map_f = UCS_MAP_CP932;
1401 #ifdef UTF8_OUTPUT_ENABLE
1402 ms_ucs_map_f = UCS_MAP_CP10001;
1407 #ifdef SHIFTJIS_CP932
1408 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1410 #ifdef UTF8_OUTPUT_ENABLE
1411 ms_ucs_map_f = UCS_MAP_ASCII;
1416 #ifdef SHIFTJIS_CP932
1417 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1419 #ifdef UTF8_OUTPUT_ENABLE
1420 ms_ucs_map_f = UCS_MAP_ASCII;
1424 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1425 #ifdef SHIFTJIS_CP932
1426 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1428 #ifdef UTF8_OUTPUT_ENABLE
1429 ms_ucs_map_f = UCS_MAP_CP932;
1433 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1435 #ifdef UTF8_OUTPUT_ENABLE
1436 ms_ucs_map_f = UCS_MAP_MS;
1440 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1442 #ifdef UTF8_OUTPUT_ENABLE
1443 ms_ucs_map_f = UCS_MAP_ASCII;
1446 case SHIFT_JISX0213:
1447 case SHIFT_JIS_2004:
1449 #ifdef SHIFTJIS_CP932
1450 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1457 #ifdef SHIFTJIS_CP932
1458 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1461 #ifdef UTF8_OUTPUT_ENABLE
1463 output_bom_f = TRUE;
1467 output_bom_f = TRUE;
1470 output_endian = ENDIAN_LITTLE;
1471 output_bom_f = FALSE;
1474 output_endian = ENDIAN_LITTLE;
1475 output_bom_f = TRUE;
1479 output_bom_f = TRUE;
1482 output_endian = ENDIAN_LITTLE;
1483 output_bom_f = FALSE;
1486 output_endian = ENDIAN_LITTLE;
1487 output_bom_f = TRUE;
/*
 * find_inputcode_byfunc: scan input_code_list for the entry whose
 * iconv_func pointer equals the given converter.
 * NOTE(review): the loop header and the return statements are not part of
 * this listing, so the "not found" result cannot be confirmed from here.
 */
1493 static struct input_code*
1494 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
/* the scan starts at the head of the global table */
1497 struct input_code *p = input_code_list;
/* entries are matched by converter function pointer */
1499 if (iconv_func == p->iconv_func){
/*
 * set_iconv: update input-converter state.
 *   f          - flag; the existing comment states that -TRUE means "FORCE"
 *   iconv_func - converter function
 * Visible behaviour: when estab_f is set and the global iconv differs from
 * iconv_for_check, the matching input_code entry is looked up, its name is
 * passed to set_input_codename(), and iconv_for_check is synchronised.
 * NOTE(review): the statements guarded by the two INPUT_CODE_FIX conditions
 * are not part of this listing.
 */
1509 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1511 #ifdef INPUT_CODE_FIX
1512 if (f || !input_encoding)
1519 #ifdef INPUT_CODE_FIX
1520 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
/* the active converter changed since it was last reported */
1526 if (estab_f && iconv_for_check != iconv){
1527 struct input_code *p = find_inputcode_byfunc(iconv);
/* NOTE(review): any null check on p would be on an omitted line - confirm */
1529 set_input_codename(p->name);
/* remember which converter has been reported */
1532 iconv_for_check = iconv;
/*
 * x0212_shift: remap a row byte in 0x75..0x7f to an extended row number.
 * Two remappings are visible: 0x75.. -> 0x109.. and 0x75.. -> 0x113..
 * NOTE(review): the condition selecting between the two, the initialisation
 * of ret and the return are on lines omitted from this listing.
 */
1539 x0212_shift(nkf_char c)
/* first variant: 0x75..0x7f -> 0x109..0x113 */
1544 if (0x75 <= c && c <= 0x7f){
1545 ret = c + (0x109 - 0x75);
/* second variant: 0x75..0x7f -> 0x113..0x11d */
1548 if (0x75 <= c && c <= 0x7f){
1549 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map a shifted row value back to a row byte starting at 0x75.
 *   0x7f..0x88 -> 0x75..0x7e
 *   0x89..0x92 -> 0x75..0x7e, tagged with PREFIX_EUCG3 | 0x80
 * NOTE(review): the default value of ret and the return are not shown here.
 */
1557 x0212_unshift(nkf_char c)
1560 if (0x7f <= c && c <= 0x88){
1561 ret = c + (0x75 - 0x7f);
1562 }else if (0x89 <= c && c <= 0x92){
/* same row range, but marked with the PREFIX_EUCG3 tag */
1563 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1567 #endif /* X0212_ENABLE */
/*
 * is_x0213_2_in_x0212: row-number predicate driven by a 16-entry lookup
 * table (rows 1, 3-5, 8 and 12-15 are marked) plus the range 78..94.
 * NOTE(review): the derivation of ku from c1 and the guard in front of the
 * table lookup are on lines omitted from this listing.
 */
1570 is_x0213_2_in_x0212(nkf_char c1)
/* indexed by row number; 1 marks a member row */
1572 static const char x0213_2_table[] =
1573 {0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1};
1576 return x0213_2_table[ku]; /* 1, 3-5, 8, 12-15 */
/* rows 78..94 are tested separately from the table */
1577 if (78 <= ku && ku <= 94)
/*
 * e2s_conv: convert the byte pair (c2, c1) into a two-byte result stored
 * through p2 (first byte) and p1 (second byte); either pointer may be NULL,
 * in which case that byte is simply not stored.
 * Visible paths:
 *   - x0213_f set and the row index passes is_x0213_2_in_x0212():
 *     arithmetic mapping for rows 0x21..0x2F and 0x6E..0x7E;
 *   - otherwise, for a graphic row index, a lookup in x0212_shiftjis;
 *   - after x0212_shift(), the generic arithmetic mapping at the end.
 * Returns 1 when c2 is above 0x7F at the final step.
 * NOTE(review): the computation of ndx, the handling of val and the other
 * return statements are on lines omitted from this listing.
 */
1583 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1588 if (x0213_f && is_x0213_2_in_x0212(ndx)){
1589 if((0x21 <= ndx && ndx <= 0x2F)){
1590 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
/* odd rows add 0x1f/0x20 depending on c1, even rows add 0x7e */
1591 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1593 }else if(0x6E <= ndx && ndx <= 0x7E){
1594 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1595 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1601 else if(nkf_isgraph(ndx)){
/* table-driven conversion: the row selects a sub-table, c1 the cell */
1603 const unsigned short *ptr;
1604 ptr = x0212_shiftjis[ndx - 0x21];
1606 val = ptr[(c1 & 0x7f) - 0x21];
1615 c2 = x0212_shift(c2);
1617 #endif /* X0212_ENABLE */
/* rows above 0x7F cannot be expressed by the arithmetic mapping */
1619 if(0x7F < c2) return 1;
1620 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1621 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s2e_conv: convert the byte pair (c2 = lead byte, c1 = trail byte) into the
 * internal two-byte form.
 * Returns 1 when the trail byte is outside 0x40..0xFC.
 * Visible paths:
 *   - IBM-extension lead bytes are looked up in shiftjis_cp932 (or, via the
 *     CP932INV range, in cp932inv) when SHIFTJIS_CP932 is defined;
 *   - IBM-extension lead bytes are looked up in shiftjis_x0212, producing a
 *     PREFIX_EUCG3-tagged value;
 *   - with x0213_f, lead bytes >= 0xF0 are mapped arithmetically;
 *   - otherwise the generic arithmetic mapping (SJ0162 / SJ6394 offsets).
 * NOTE(review): declarations, the stores through p2/p1 and the remaining
 * return statements are on lines omitted from this listing.
 */
1626 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1628 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1631 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
/*
 * Every table below is indexed with [c1 - 0x40]; reject trail bytes below
 * 0x40 as well as above 0xFC so the index can never become negative.
 */
1632 if (c1 < 0x40 || 0xFC < c1) return 1;
1633 #ifdef SHIFTJIS_CP932
1634 if (!cp932inv_f && !x0213_f && is_ibmext_in_sjis(c2)){
1635 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1642 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1643 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1649 #endif /* SHIFTJIS_CP932 */
1651 if (!x0213_f && is_ibmext_in_sjis(c2)){
1652 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
/* the high byte of the table value becomes the tagged row */
1655 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1668 if(x0213_f && c2 >= 0xF0){
1669 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
/* the second index picks the row for the upper half of the trail range */
1670 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1671 }else{ /* 78<=k<=94 */
1672 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1673 if (0x9E < c1) c2++;
1676 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1677 #define SJ6394 0x0161 /* 63 - 94 ku offset */
/* generic mapping: two rows per lead byte, the trail byte selects which */
1678 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1679 if (0x9E < c1) c2++;
1682 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1689 c2 = x0212_unshift(c2);
1696 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * nkf_unicode_to_utf8: encode the code point val as UTF-8 bytes stored
 * through p1..p4.
 *   val < 0x800                    -> 2 bytes (p1, p2)
 *   nkf_char_unicode_bmp_p(val)    -> 3 bytes (p1..p3)
 *   nkf_char_unicode_value_p(val)  -> 4 bytes (p1..p4)
 * NOTE(review): the single-byte branch, the clearing of unused outputs and
 * the fall-through case are on lines omitted from this listing.
 */
1698 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1706 }else if (val < 0x800){
/* 110xxxxx 10xxxxxx */
1707 *p1 = 0xc0 | (val >> 6);
1708 *p2 = 0x80 | (val & 0x3f);
1711 } else if (nkf_char_unicode_bmp_p(val)) {
/* 1110xxxx 10xxxxxx 10xxxxxx */
1712 *p1 = 0xe0 | (val >> 12);
1713 *p2 = 0x80 | ((val >> 6) & 0x3f);
1714 *p3 = 0x80 | ( val & 0x3f);
1716 } else if (nkf_char_unicode_value_p(val)) {
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
1717 *p1 = 0xf0 | (val >> 18);
1718 *p2 = 0x80 | ((val >> 12) & 0x3f);
1719 *p3 = 0x80 | ((val >> 6) & 0x3f);
1720 *p4 = 0x80 | ( val & 0x3f);
/*
 * nkf_utf8_to_unicode: decode one UTF-8 sequence into a code point.
 *   c1 - lead byte; c2..c4 - continuation bytes (unused ones are ignored)
 * The branch is selected by the lead byte c1:
 *   <= 0xC1 trail byte or invalid, <= 0xDF two bytes, <= 0xEF three bytes,
 *   <= 0xF4 four bytes.
 * NOTE(review): the single-byte branch, the low-order "|=" steps, the final
 * else branch and the return statements are on lines omitted from this
 * listing.
 */
1730 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
1737 else if (c1 <= 0xC1) {
1738 /* trail byte or invalid */
1741 else if (c1 <= 0xDF) {
1743 wc = (c1 & 0x1F) << 6;
1746 else if (c1 <= 0xEF) {
1748 wc = (c1 & 0x0F) << 12;
1749 wc |= (c2 & 0x3F) << 6;
/*
 * Test the lead byte here: the previous test on c2 (a continuation byte,
 * never above 0xBF) let lead bytes 0xF5..0xFF through as 4-byte sequences
 * and made the final else branch unreachable.
 */
1752 else if (c1 <= 0xF4) {
1754 wc = (c1 & 0x0F) << 18;
1755 wc |= (c2 & 0x3F) << 12;
1756 wc |= (c3 & 0x3F) << 6;
1766 #ifdef UTF8_INPUT_ENABLE
/*
 * unicode_to_jis_common2: two-level table lookup shared by the Unicode -> JIS
 * conversions.
 *   c1, c0 - indices into the first and second table level
 *   pp     - first-level table (array of rows), psize its number of rows
 *   p2, p1 - outputs
 * Returns 1 when pp is NULL, an index is out of range, the selected row is
 * NULL, the cell is 0, or (with no_cp932ext_f) the value belongs to the NEC
 * special / IBM extended ranges.
 * NOTE(review): the index adjustment of c1/c0, the extraction of c2 from val
 * and the stores through p2/p1 are on lines omitted from this listing.
 */
1768 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1769 const unsigned short *const *pp, nkf_char psize,
1770 nkf_char *p2, nkf_char *p1)
1773 const unsigned short *p;
1776 if (pp == 0) return 1;
/* bounds check for the first level */
1779 if (c1 < 0 || psize <= c1) return 1;
1781 if (p == 0) return 1;
/* bounds check for the second level; uses a fixed row size, not psize */
1784 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
/* 0 marks an unmapped cell */
1786 if (val == 0) return 1;
1787 if (no_cp932ext_f && (
1788 (val>>8) == 0x2D || /* NEC special characters */
1789 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* an SO row marker is translated to the JIS X 0201 kana designator */
1797 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * unicode_to_jis_common: convert one UTF-8 sequence (c2 = lead byte, c1 and
 * c0 = following bytes) to a JIS pair stored through p2/p1.
 * The branch is chosen by the lead byte c2: below 0xe0 is a 2-byte sequence
 * (tables utf8_to_euc_2bytes*), below 0xF0 a 3-byte sequence (tables
 * utf8_to_euc_3bytes*, indexed by c2 - 0xE0).
 * With no_best_fit_chars_f set, characters listed in the no_best_fit tables
 * (or by the explicit byte tests) are rejected with return value 1.
 * With SHIFTJIS_CP932, a successful G3 result is round-tripped through
 * e2s_conv()/s2e_conv().
 * NOTE(review): the single-byte branch, the switch/case lines that select
 * the explicit byte tests, the final else branch and the return are on
 * lines omitted from this listing.
 */
1805 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1807 const unsigned short *const *pp;
1808 const unsigned short *const *const *ppp;
/* the tables below are indexed by the low 6 bits of the second byte; non-zero means "reject" */
1809 static const char no_best_fit_chars_table_C2[] =
1810 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1811 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1812 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1813 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1814 static const char no_best_fit_chars_table_C2_ms[] =
1815 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1816 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1817 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1818 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1819 static const char no_best_fit_chars_table_932_C2[] =
1820 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1821 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1822 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1823 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1824 static const char no_best_fit_chars_table_932_C3[] =
1825 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1826 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1827 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1828 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* 2-byte sequences */
1834 }else if(c2 < 0xe0){
1835 if(no_best_fit_chars_f){
1836 if(ms_ucs_map_f == UCS_MAP_CP932){
1839 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1842 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1845 }else if(!cp932inv_f){
1848 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1851 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1854 }else if(ms_ucs_map_f == UCS_MAP_MS){
1855 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1856 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
/* the mapping table depends on the selected UCS map */
1874 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1875 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1876 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1877 x0213_f ? utf8_to_euc_2bytes_x0213 :
1879 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/*
 * 3-byte sequences. The test is on the lead byte c2 (it was on c0, the third
 * byte), which keeps the ppp[c2 - 0xE0] lookup below within its 16 rows.
 */
1880 }else if(c2 < 0xF0){
1881 if(no_best_fit_chars_f){
1882 if(ms_ucs_map_f == UCS_MAP_CP932){
1883 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1884 }else if(ms_ucs_map_f == UCS_MAP_MS){
1889 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1892 if(c0 == 0x92) return 1;
1897 if(c1 == 0x80 || c0 == 0x9C) return 1;
1900 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1905 if(c0 == 0x94) return 1;
1908 if(c0 == 0xBB) return 1;
1918 if(c0 == 0x95) return 1;
1921 if(c0 == 0xA5) return 1;
1928 if(c0 == 0x8D) return 1;
1931 if(c0 == 0x9E && !cp932inv_f) return 1;
1934 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1942 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1943 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1944 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1945 x0213_f ? utf8_to_euc_3bytes_x0213 :
1947 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1949 #ifdef SHIFTJIS_CP932
/* successful G3 result: either flag it for fallback or round-trip it */
1950 if (!ret&& is_eucg3(*p2)) {
1952 if (encode_fallback) ret = 1;
1956 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1957 s2e_conv(s2, s1, p2, p1);
1967 #ifdef UTF8_OUTPUT_ENABLE
/*
 * X0213_SURROGATE_FIND(tbl, size, euc): linear search of tbl[0..size-1] for
 * the row whose first column equals euc. The macro uses a variable named i
 * from the expansion site.
 * NOTE(review): the action taken on a match and the end of the macro are on
 * lines omitted from this listing.
 */
1968 #define X0213_SURROGATE_FIND(tbl, size, euc) do { \
1970 for (i = 0; i < size; i++) \
1971 if (tbl[i][0] == euc) { \
/*
 * e2w_conv: convert an internal (c2, c1) pair to a Unicode value.
 * Visible paths:
 *   - c2 == JIS_X_0201_1976_K: single-byte table euc_to_utf8_1byte
 *     (with special handling for UCS_MAP_CP10001);
 *   - is_eucg3(c2): row tables x0212_to_utf8_2bytes[_x0213];
 *   - otherwise: row tables euc_to_utf8_2bytes[_x0213|_mac|_ms].
 * The row c2 and the cell c1 are reduced to 0-based indices and range-checked
 * before the lookup. With x0213_f, a table value in the high-surrogate range
 * 0xD800..0xDBFF is completed to a full code point through the surrogate
 * tables and UTF16_TO_UTF32().
 * NOTE(review): several returns and the fall-through result are on lines
 * omitted from this listing.
 */
1978 e2w_conv(nkf_char c2, nkf_char c1)
1980 const unsigned short *p;
1982 if (c2 == JIS_X_0201_1976_K) {
1983 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1991 p = euc_to_utf8_1byte;
1993 } else if (is_eucg3(c2)){
/* one specific G3 code point is special-cased for the ASCII map */
1994 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
/* row byte -> 0-based row index */
1997 c2 = (c2&0x7f) - 0x21;
1998 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2000 x0213_f ? x0212_to_utf8_2bytes_x0213[c2] :
2001 x0212_to_utf8_2bytes[c2];
2007 c2 = (c2&0x7f) - 0x21;
2008 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
2010 x0213_f ? euc_to_utf8_2bytes_x0213[c2] :
2011 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
2012 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
2013 euc_to_utf8_2bytes_ms[c2];
/* cell byte -> 0-based cell index */
2018 c1 = (c1 & 0x7f) - 0x21;
2019 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte) {
2020 nkf_char val = p[c1];
/* a high surrogate in the table means the low half lives in a side table */
2021 if (x0213_f && 0xD800<=val && val<=0xDBFF) {
2022 nkf_char euc = (c2+0x21)<<8 | (c1+0x21);
/* plane 2 rows use their own surrogate table */
2024 if (p==x0212_to_utf8_2bytes_x0213[c2]) {
2025 X0213_SURROGATE_FIND(x0213_2_surrogate_table, sizeof_x0213_2_surrogate_table, euc);
2027 X0213_SURROGATE_FIND(x0213_1_surrogate_table, sizeof_x0213_1_surrogate_table, euc);
2030 return UTF16_TO_UTF32(val, low);
/*
 * e2w_combining: if comb is one of x0213_combining_chars, look up the pair
 * (c2, c1) in x0213_combining_table and return the second column of the
 * matching row.
 * NOTE(review): the results for "comb is not a combining character" and "no
 * matching row" are on lines omitted from this listing.
 */
2039 e2w_combining(nkf_char comb, nkf_char c2, nkf_char c1)
/* step 1: is comb a known combining character? */
2043 for (i = 0; i < sizeof_x0213_combining_chars; i++)
2044 if (x0213_combining_chars[i] == comb)
2046 if (i >= sizeof_x0213_combining_chars)
/* step 2: search the table by the 7-bit row/cell pair */
2048 euc = (c2&0x7f)<<8 | (c1&0x7f);
2049 for (i = 0; i < sizeof_x0213_combining_table; i++)
2050 if (x0213_combining_table[i][0] == euc)
2051 return x0213_combining_table[i][1];
/*
 * w2e_conv: convert a UTF-8 sequence (c2 = lead byte, c1, c0) to an internal
 * pair stored through p2/p1. Lead bytes 0xc0..0xef are delegated to
 * unicode_to_jis_common(); with NUMCHAR_OPTION the raw code point can be
 * stored through p1 as a tagged Unicode value.
 * NOTE(review): the other lead-byte branches, the condition guarding the
 * NUMCHAR_OPTION store and the return are on lines omitted from this listing.
 */
2057 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
2064 }else if (0xc0 <= c2 && c2 <= 0xef) {
2065 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2066 #ifdef NUMCHAR_OPTION
2069 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
2077 #ifdef UTF8_INPUT_ENABLE
/*
 * w16e_conv: convert the code point val to an internal pair stored through
 * p2/p1.
 *   - BMP values are re-encoded as UTF-8 and passed to
 *     unicode_to_jis_common();
 *   - other values are split into a UTF-16 surrogate pair and searched in
 *     x0213_1_surrogate_table, then x0213_2_surrogate_table (a plane 2 hit
 *     is tagged with PREFIX_EUCG3);
 *   - on the visible fallback lines the value is stored through p1 as a
 *     tagged Unicode value.
 * NOTE(review): the conditions around the fallbacks and all return
 * statements are on lines omitted from this listing.
 */
2079 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
2081 nkf_char c1, c2, c3, c4;
2088 else if (nkf_char_unicode_bmp_p(val)){
2089 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2090 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
2093 *p1 = nkf_char_unicode_new(val);
/* 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000 offset into the shift */
2100 c1 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2101 c2 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2102 for (i = 0; i < sizeof_x0213_1_surrogate_table; i++)
2103 if (x0213_1_surrogate_table[i][1] == c1 && x0213_1_surrogate_table[i][2] == c2) {
2104 val = x0213_1_surrogate_table[i][0];
2109 for (i = 0; i < sizeof_x0213_2_surrogate_table; i++)
2110 if (x0213_2_surrogate_table[i][1] == c1 && x0213_2_surrogate_table[i][2] == c2) {
2111 val = x0213_2_surrogate_table[i][0];
2112 *p2 = PREFIX_EUCG3 | (val >> 8);
2118 *p1 = nkf_char_unicode_new(val);
/*
 * e_iconv: input converter for EUC-style byte sequences (c2 = first byte,
 * c1 = second, c0 = third for 0x8f-prefixed sequences).
 * Visible branches:
 *   - JIS X 0201 kana / SS2: replaced by GETA1/GETA2 when iso2022jp_f is set
 *     and x0201_f is not, otherwise normalised to JIS_X_0201_1976_K;
 *   - 0x8f prefix: user-defined rows 0xF5..0xFE map to the Private Use Area
 *     from U+E3AC, other codes are packed as (c2 << 8) | row and, with
 *     SHIFTJIS_CP932, round-tripped through e2s_conv()/s2e_conv();
 *   - EOF, 0, control bytes and ISO_8859_1 take a separate branch;
 *   - two-byte codes: user-defined rows map to the Private Use Area from
 *     U+E000; with cp51932_f rows 0x79..0x7c are round-tripped.
 * NOTE(review): the output call and the return are on lines omitted from
 * this listing.
 */
2125 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2127 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
2128 if (iso2022jp_f && !x0201_f) {
2129 c2 = GETA1; c1 = GETA2;
2131 c2 = JIS_X_0201_1976_K;
2135 }else if (c2 == 0x8f){
2139 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2140 /* encoding is eucJP-ms, so map to the Unicode Private Use Area */
/* 94 cells per row; the first 3-byte user-defined cell lands on U+E3AC */
2141 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2144 c2 = (c2 << 8) | (c1 & 0x7f);
2146 #ifdef SHIFTJIS_CP932
2149 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2150 s2e_conv(s2, s1, &c2, &c1);
2157 #endif /* SHIFTJIS_CP932 */
2159 #endif /* X0212_ENABLE */
2160 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2163 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2164 /* encoding is eucJP-ms, so map to the Unicode Private Use Area */
2165 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2170 #ifdef SHIFTJIS_CP932
2171 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2173 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2174 s2e_conv(s2, s1, &c2, &c1);
2181 #endif /* SHIFTJIS_CP932 */
/*
 * s_iconv: input converter for Shift_JIS-style byte pairs (c2 = lead byte,
 * c1 = trail byte).
 * Visible branches:
 *   - JIS X 0201 kana (tag or bytes 0xA1..0xDF): replaced by GETA1/GETA2
 *     when iso2022jp_f is set and x0201_f is not;
 *   - EOF, 0 and control bytes take a separate branch;
 *   - without x0213_f, lead bytes 0xF0..0xF9 are user-defined characters
 *     mapped to the Private Use Area from U+E000 (188 cells per lead byte,
 *     the 0x7F gap in the trail range is skipped);
 *   - everything else goes through s2e_conv(); a non-zero result is
 *     returned to the caller.
 * NOTE(review): c2 is declared ARG_UNUSED although the body reads it.
 * NOTE(review): the output call and the final return are on lines omitted
 * from this listing.
 */
2189 s_iconv(ARG_UNUSED nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2191 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2192 if (iso2022jp_f && !x0201_f) {
2193 c2 = GETA1; c1 = GETA2;
2197 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2199 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
/* 0x7F is not a valid trail byte */
2201 if(c1 == 0x7F) return 0;
2202 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2205 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2206 if (ret) return ret;
/*
 * x0213_wait_combining_p: scan x0213_combining_table for a row whose second
 * column (the base character) equals wc.
 * NOTE(review): the return values are on lines omitted from this listing.
 */
2213 x0213_wait_combining_p(nkf_char wc)
2216 for (i = 0; i < sizeof_x0213_combining_table; i++) {
2217 if (x0213_combining_table[i][1] == wc) {
/*
 * x0213_combining_p: scan x0213_combining_chars for an entry equal to wc.
 * NOTE(review): the return values are on lines omitted from this listing.
 */
2225 x0213_combining_p(nkf_char wc)
2228 for (i = 0; i < sizeof_x0213_combining_chars; i++) {
2229 if (x0213_combining_chars[i] == wc) {
/*
 * w_iconv: input converter for UTF-8 (c1 = lead byte, c2 and c3 = following
 * bytes; note the parameter order differs from e_iconv/s_iconv).
 * The lead byte is classified through w_iconv_utf8_1st_byte (indexed by
 * c1 - 0xC0) and the continuation bytes are validated per class.
 * Return values visible here: 0 for a rejected 2-byte form, -1 / -2 when c3
 * is still 0 and more bytes are needed for a 3-byte / 4-byte form.
 * After validation, 4-byte sequences are stored as a tagged Unicode value,
 * and other sequences go through w2e_conv(), with a prior check (under
 * x0213_f) whether the character may be followed by a combining character.
 * NOTE(review): the case labels, how c4 is obtained and the remaining
 * returns are on lines omitted from this listing.
 */
2237 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2239 nkf_char ret = 0, c4 = 0;
/* class of each lead byte 0xC0..0xFF; the tens digit groups the classes */
2240 static const char w_iconv_utf8_1st_byte[] =
2242 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2243 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2244 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2245 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2252 if (c1 < 0 || 0xff < c1) {
2253 }else if (c1 == 0) { /* 0 : 1 byte*/
2255 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2258 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2260 if (c2 < 0x80 || 0xBF < c2) return 0;
/* 3-byte forms: c3 == 0 means the third byte has not been read yet */
2263 if (c3 == 0) return -1;
2264 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2269 if (c3 == 0) return -1;
2270 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2274 if (c3 == 0) return -1;
2275 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
/* 4-byte forms: two more bytes are requested */
2279 if (c3 == 0) return -2;
2280 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2284 if (c3 == 0) return -2;
2285 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2289 if (c3 == 0) return -2;
2290 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2298 if (c1 == 0 || c1 == EOF){
2299 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2300 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
/* a base character of a combining pair may need the next character first */
2303 if (x0213_f && x0213_wait_combining_p(nkf_utf8_to_unicode(c1, c2, c3, c4)))
2305 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/*
 * w_iconv_nocombine: convert a UTF-8 sequence through w2e_conv() without the
 * combining-character check performed by w_iconv().
 * NOTE(review): the output call and the return are on lines omitted from
 * this listing.
 */
2314 w_iconv_nocombine(nkf_char c1, nkf_char c2, nkf_char c3)
2316 /* continue from the line below 'return -3;' in w_iconv() */
2317 nkf_char ret = w2e_conv(c1, c2, c3, &c1, &c2);
/*
 * Negative status codes returned by the unicode_iconv family.
 * The values are parenthesised so the macros expand to a single primary
 * expression wherever they are used.
 */
2324 #define NKF_ICONV_INVALID_CODE_RANGE (-13)
2325 #define NKF_ICONV_WAIT_COMBINING_CHAR (-14)
2326 #define NKF_ICONV_NOT_COMBINED (-15)
/*
 * unicode_iconv: convert one code point wc to the internal form.
 *   nocombine - when non-zero, skip the "wait for a combining character"
 *               check
 * Returns NKF_ICONV_INVALID_CODE_RANGE for surrogate code points and for
 * values above U+10FFFF, NKF_ICONV_WAIT_COMBINING_CHAR when (under x0213_f)
 * wc may be the base of a combining pair, or the non-zero result of
 * w16e_conv().
 * NOTE(review): the first branch, the output call and the final return are
 * on lines omitted from this listing.
 */
2328 unicode_iconv(nkf_char wc, int nocombine)
/* wc >> 11 == 27 selects exactly 0xD800..0xDFFF */
2336 }else if ((wc>>11) == 27) {
2337 /* unpaired surrogate */
2338 return NKF_ICONV_INVALID_CODE_RANGE;
2339 }else if (wc < 0xFFFF) {
2340 if (!nocombine && x0213_f && x0213_wait_combining_p(wc))
2341 return NKF_ICONV_WAIT_COMBINING_CHAR;
2342 ret = w16e_conv(wc, &c2, &c1);
2343 if (ret) return ret;
/* U+10FFFF is the last valid code point, so the bound is inclusive */
2344 }else if (wc <= 0x10FFFF) {
2346 c1 = nkf_char_unicode_new(wc);
2348 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * unicode_iconv_combine: try to combine base character wc with the following
 * character wc2 through x0213_combining_table.
 * Returns NKF_ICONV_NOT_COMBINED when wc2 is not a combining character or no
 * table row matches, and NKF_ICONV_INVALID_CODE_RANGE when wc2 is a
 * surrogate or lies above U+10FFFF.
 * NOTE(review): the first branch, the output call on a match and its return
 * are on lines omitted from this listing.
 */
2355 unicode_iconv_combine(nkf_char wc, nkf_char wc2)
2361 return NKF_ICONV_NOT_COMBINED;
2362 }else if ((wc2>>11) == 27) {
2363 /* unpaired surrogate */
2364 return NKF_ICONV_INVALID_CODE_RANGE;
2365 }else if (wc2 < 0xFFFF) {
2366 if (!x0213_combining_p(wc2))
2367 return NKF_ICONV_NOT_COMBINED;
/* columns: [0] combined code, [1] base character, [2] combining character */
2368 for (i = 0; i < sizeof_x0213_combining_table; i++) {
2369 if (x0213_combining_table[i][1] == wc &&
2370 x0213_combining_table[i][2] == wc2) {
2371 c2 = x0213_combining_table[i][0] >> 8;
2372 c1 = x0213_combining_table[i][0] & 0x7f;
/* U+10FFFF is a valid (non-combining) code point, so the bound is inclusive */
2377 }else if (wc2 <= 0x10FFFF) {
2378 return NKF_ICONV_NOT_COMBINED;
2380 return NKF_ICONV_INVALID_CODE_RANGE;
2382 return NKF_ICONV_NOT_COMBINED;
/*
 * w_iconv_combine: decode two UTF-8 sequences (c1..c3 and c4..c6) and try to
 * combine them through unicode_iconv_combine().
 * NOTE(review): any checks between decoding and the call are on lines
 * omitted from this listing.
 */
2386 w_iconv_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6)
2389 wc = nkf_utf8_to_unicode(c1, c2, c3, 0);
2390 wc2 = nkf_utf8_to_unicode(c4, c5, c6, 0);
2393 return unicode_iconv_combine(wc, wc2);
/*
 * "Need more input" status codes, expressed as size_t values.
 * The cast expressions are parenthesised so the macros expand to a single
 * primary expression (e.g. they stay intact as a sizeof operand).
 */
2396 #define NKF_ICONV_NEED_ONE_MORE_BYTE ((size_t)-1)
2397 #define NKF_ICONV_NEED_TWO_MORE_BYTES ((size_t)-2)
/*
 * nkf_iconv_utf_16: assemble a code point from UTF-16 bytes c1..c4 according
 * to input_endian and hand it to unicode_iconv().
 * When the first unit is a high surrogate (0xD8..0xDB in its high byte) the
 * second unit must be a low surrogate (0xDC..0xDF); otherwise
 * NKF_ICONV_NEED_TWO_MORE_BYTES is returned.
 * NOTE(review): the non-surrogate case and the initial checks are on lines
 * omitted from this listing.
 */
2399 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/* big endian: the high byte of each unit comes first */
2408 if (input_endian == ENDIAN_BIG) {
2409 if (0xD8 <= c1 && c1 <= 0xDB) {
2410 if (0xDC <= c3 && c3 <= 0xDF) {
2411 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2412 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* other branch: the high byte of each unit comes second */
2417 if (0xD8 <= c2 && c2 <= 0xDB) {
2418 if (0xDC <= c4 && c4 <= 0xDF) {
2419 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2420 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2426 return (*unicode_iconv)(wc, FALSE);
/*
 * nkf_iconv_utf_16_combine: try to combine two UTF-16 units (c1,c2 and
 * c3,c4) through unicode_iconv_combine(). A high surrogate in the tested
 * byte yields NKF_ICONV_NOT_COMBINED.
 * NOTE(review): the two branches test different bytes (c3 vs c2); the
 * assembly of wc/wc2 is on omitted lines, so the intent cannot be confirmed.
 */
2430 nkf_iconv_utf_16_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2434 if (input_endian == ENDIAN_BIG) {
2435 if (0xD8 <= c3 && c3 <= 0xDB) {
2436 return NKF_ICONV_NOT_COMBINED;
2442 if (0xD8 <= c2 && c2 <= 0xDB) {
2443 return NKF_ICONV_NOT_COMBINED;
2450 return unicode_iconv_combine(wc, wc2);
/*
 * nkf_iconv_utf_16_nocombine: convert a single UTF-16 unit (bytes c1, c2,
 * ordered per input_endian) through unicode_iconv() with the combining
 * check disabled.
 * NOTE(review): the assembly of wc is on lines omitted from this listing.
 */
2454 nkf_iconv_utf_16_nocombine(nkf_char c1, nkf_char c2)
2457 if (input_endian == ENDIAN_BIG)
2461 return (*unicode_iconv)(wc, TRUE);
/*
 * w_iconv16: converter entry for UTF-16 input. The visible return value 16
 * distinguishes it from w_iconv32 (which returns 32).
 * NOTE(review): the rest of the body is on lines omitted from this listing.
 */
2465 w_iconv16(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2468 return 16; /* different from w_iconv32 */
/*
 * w_iconv32: converter entry for UTF-32 input. The visible return value 32
 * distinguishes it from w_iconv16 (which returns 16).
 * NOTE(review): the rest of the body is on lines omitted from this listing.
 */
2472 w_iconv32(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2475 return 32; /* different from w_iconv16 */
/*
 * utf32_to_nkf_char: assemble a code point from four UTF-32 bytes in the
 * byte order selected by input_endian. Four orders are visible; an unknown
 * order yields NKF_ICONV_INVALID_CODE_RANGE.
 * NOTE(review): only three of the four bytes enter each expression; the
 * remaining (most significant) byte is not used on the visible lines.
 * NOTE(review): the case labels are on lines omitted from this listing; the
 * order annotations below are read off the expressions.
 */
2479 utf32_to_nkf_char(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2483 switch(input_endian){
/* bytes arrive most significant first */
2485 wc = c2 << 16 | c3 << 8 | c4;
/* bytes arrive least significant first */
2488 wc = c3 << 16 | c2 << 8 | c1;
/* pairwise-swapped orders */
2491 wc = c1 << 16 | c4 << 8 | c3;
2494 wc = c4 << 16 | c1 << 8 | c2;
2497 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * nkf_iconv_utf_32: decode four UTF-32 bytes with utf32_to_nkf_char() and
 * convert the code point through unicode_iconv() (combining check enabled).
 * NOTE(review): the checks around the decode are on lines omitted from this
 * listing.
 */
2503 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2512 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2516 return (*unicode_iconv)(wc, FALSE);
/*
 * nkf_iconv_utf_32_combine: decode two UTF-32 units (c1..c4 and c5..c8) and
 * try to combine them through unicode_iconv_combine().
 * NOTE(review): the error checks after each decode are on lines omitted
 * from this listing.
 */
2520 nkf_iconv_utf_32_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6, nkf_char c7, nkf_char c8)
2524 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2527 wc2 = utf32_to_nkf_char(c5, c6, c7, c8);
2531 return unicode_iconv_combine(wc, wc2);
/*
 * nkf_iconv_utf_32_nocombine: decode one UTF-32 unit and convert it through
 * unicode_iconv() with the combining check disabled.
 */
2535 nkf_iconv_utf_32_nocombine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2539 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2540 return (*unicode_iconv)(wc, TRUE);
/*
 * output_ascii_escape_sequence(mode): when the current output_mode is
 * neither ASCII nor ISO_8859_1, emit an escape sequence ending in
 * ascii_intro and switch output_mode to mode.
 * NOTE(review): the leading bytes of the sequence and the end of the macro
 * are on lines omitted from this listing.
 */
2544 #define output_ascii_escape_sequence(mode) do { \
2545 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2548 (*o_putc)(ascii_intro); \
2549 output_mode = mode; \
/*
 * output_escape_sequence: emit the escape sequence that switches the output
 * to the given mode; nothing needs to be done when output_mode already
 * equals mode.
 * NOTE(review): the per-mode byte sequences are on lines omitted from this
 * listing; only one case label and one output byte are visible.
 */
2554 output_escape_sequence(int mode)
2556 if (output_mode == mode)
2564 case JIS_X_0201_1976_K:
2572 (*o_putc)(kanji_intro);
/*
 * j_oconv: output converter that emits escape sequences to switch between
 * character sets (ASCII, ISO_8859_1, JIS X 0201 kana, JIS X 0212 /
 * JIS X 0213 plane 2, JIS X 0208 / JIS X 0213 plane 1).
 * With NUMCHAR_OPTION, a tagged Unicode value is first converted with
 * w16e_conv(); if it is still unconverted, Private Use Area values
 * 0xE000..0xE757 are mapped to user-defined rows (when ms_ucs_map_f is set)
 * and anything else is passed to encode_fallback when that is set.
 * Two-byte codes outside the printable ranges are dropped.
 * NOTE(review): most output calls and several branch conditions are on
 * lines omitted from this listing.
 */
2597 j_oconv(nkf_char c2, nkf_char c1)
2599 #ifdef NUMCHAR_OPTION
2600 if (c2 == 0 && nkf_char_unicode_p(c1)){
2601 w16e_conv(c1, &c2, &c1);
/* still a tagged Unicode value: no table mapping was found */
2602 if (c2 == 0 && nkf_char_unicode_p(c1)){
2603 c2 = c1 & VALUE_MASK;
2604 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
/* 94 cells per row, rows starting at 0x7F */
2607 c2 = 0x7F + c1 / 94;
2608 c1 = 0x21 + c1 % 94;
2610 if (encode_fallback) (*encode_fallback)(c1);
2617 output_ascii_escape_sequence(ASCII);
2620 else if (c2 == EOF) {
2621 output_ascii_escape_sequence(ASCII);
2624 else if (c2 == ISO_8859_1) {
2625 output_ascii_escape_sequence(ISO_8859_1);
2628 else if (c2 == JIS_X_0201_1976_K) {
2629 output_escape_sequence(JIS_X_0201_1976_K);
2632 } else if (is_eucg3(c2)){
2633 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2634 (*o_putc)(c2 & 0x7f);
/* range check; the first alternative allows rows up to 0x92 */
2639 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2640 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2641 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
/*
 * e_oconv: output converter that writes bytes with the high bit set.
 * Visible branches:
 *   - tagged Unicode values are converted with w16e_conv(); unconverted
 *     Private Use Area values 0xE000..0xE757 (when x0212_f is set) are
 *     mapped to user-defined rows, anything else goes to encode_fallback
 *     when that is set;
 *   - c2 == 0: output_mode becomes ASCII;
 *   - JIS X 0201 kana: SS2 followed by c1 | 0x80;
 *   - ISO_8859_1: c1 | 0x80;
 *   - G3 codes: with SHIFTJIS_CP932 round-tripped through
 *     e2s_conv()/s2e_conv(), then written as two high-bit bytes;
 *   - other two-byte codes: if either byte is not graphic, set_iconv(FALSE, 0)
 *     is called and the character is dropped; otherwise both bytes are
 *     written with the high bit set.
 * NOTE(review): the 0x8f prefix output and several conditions are on lines
 * omitted from this listing.
 */
2648 e_oconv(nkf_char c2, nkf_char c1)
2650 if (c2 == 0 && nkf_char_unicode_p(c1)){
2651 w16e_conv(c1, &c2, &c1);
2652 if (c2 == 0 && nkf_char_unicode_p(c1)){
2653 c2 = c1 & VALUE_MASK;
2654 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
/* the first 10 rows use rows 0x75..; later rows are offset by 0x8FEB */
2658 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2659 c1 = 0x21 + c1 % 94;
2662 (*o_putc)((c2 & 0x7f) | 0x080);
2663 (*o_putc)(c1 | 0x080);
2665 (*o_putc)((c2 & 0x7f) | 0x080);
2666 (*o_putc)(c1 | 0x080);
2670 if (encode_fallback) (*encode_fallback)(c1);
2678 } else if (c2 == 0) {
2679 output_mode = ASCII;
2681 } else if (c2 == JIS_X_0201_1976_K) {
2682 output_mode = EUC_JP;
2683 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2684 } else if (c2 == ISO_8859_1) {
2685 output_mode = ISO_8859_1;
2686 (*o_putc)(c1 | 0x080);
2688 } else if (is_eucg3(c2)){
2689 output_mode = EUC_JP;
2690 #ifdef SHIFTJIS_CP932
2693 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2694 s2e_conv(s2, s1, &c2, &c1);
2699 output_mode = ASCII;
2701 }else if (is_eucg3(c2)){
2704 (*o_putc)((c2 & 0x7f) | 0x080);
2705 (*o_putc)(c1 | 0x080);
2708 (*o_putc)((c2 & 0x7f) | 0x080);
2709 (*o_putc)(c1 | 0x080);
/* a non-graphic byte here resets the converter and drops the character */
2713 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2714 set_iconv(FALSE, 0);
2715 return; /* too late to rescue this char */
2717 output_mode = EUC_JP;
2718 (*o_putc)(c2 | 0x080);
2719 (*o_putc)(c1 | 0x080);
/*
 * s_oconv: output converter producing Shift_JIS-style bytes.
 * Visible branches:
 *   - with NUMCHAR_OPTION, tagged Unicode values are converted with
 *     w16e_conv(); unconverted Private Use Area values 0xE000..0xE757
 *     (without x0213_f) are mapped to user-defined lead bytes starting at
 *     0xF0 (cp932inv_f) or 0xEB, anything else goes to encode_fallback when
 *     that is set;
 *   - c2 == 0: output_mode becomes ASCII;
 *   - JIS X 0201 kana and ISO_8859_1 are written as single bytes;
 *   - G3 codes are converted with e2s_conv();
 *   - other two-byte codes: if either byte is not printable,
 *     set_iconv(FALSE, 0) is called and the character is dropped; otherwise
 *     e2s_conv() is applied and, with SHIFTJIS_CP932, the result may be
 *     replaced through the cp932inv table;
 *   - a non-zero prefix_table entry for the trail byte is written before it.
 * NOTE(review): most output calls and several conditions are on lines
 * omitted from this listing.
 */
2724 s_oconv(nkf_char c2, nkf_char c1)
2726 #ifdef NUMCHAR_OPTION
2727 if (c2 == 0 && nkf_char_unicode_p(c1)){
2728 w16e_conv(c1, &c2, &c1);
2729 if (c2 == 0 && nkf_char_unicode_p(c1)){
2730 c2 = c1 & VALUE_MASK;
2731 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
/* 188 cells per lead byte */
2734 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
/* trail bytes start at 0x40 and skip 0x7F */
2736 c1 += 0x40 + (c1 > 0x3e);
2741 if(encode_fallback)(*encode_fallback)(c1);
2750 } else if (c2 == 0) {
2751 output_mode = ASCII;
2753 } else if (c2 == JIS_X_0201_1976_K) {
2754 output_mode = SHIFT_JIS;
2756 } else if (c2 == ISO_8859_1) {
2757 output_mode = ISO_8859_1;
2758 (*o_putc)(c1 | 0x080);
2760 } else if (is_eucg3(c2)){
2761 output_mode = SHIFT_JIS;
2762 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2768 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2769 set_iconv(FALSE, 0);
2770 return; /* too late to rescue this char */
2772 output_mode = SHIFT_JIS;
2773 e2s_conv(c2, c1, &c2, &c1);
2775 #ifdef SHIFTJIS_CP932
2777 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2778 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2784 #endif /* SHIFTJIS_CP932 */
/* an optional prefix byte is emitted in front of the trail byte */
2787 if (prefix_table[(unsigned char)c1]){
2788 (*o_putc)(prefix_table[(unsigned char)c1]);
2794 #ifdef UTF8_OUTPUT_ENABLE
/*
 * OUTPUT_UTF8(val): encode val with nkf_unicode_to_utf8() into the variables
 * c1..c4 of the expansion site and write every non-zero byte through o_putc.
 * NOTE(review): the output of c1 and the end of the macro are on lines
 * omitted from this listing.
 */
2795 #define OUTPUT_UTF8(val) do { \
2796 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); \
2798 if (c2) (*o_putc)(c2); \
2799 if (c3) (*o_putc)(c3); \
2800 if (c4) (*o_putc)(c4); \
/*
 * w_oconv: output converter for UTF-8.
 * Visible steps: output_bom_f is cleared (the BOM output itself is not
 * shown); a tagged Unicode value is unwrapped with VALUE_MASK; other codes
 * are converted with e2w_conv() and checked for a combining partner with
 * e2w_combining().
 * NOTE(review): the actual byte output is on lines omitted from this
 * listing.
 */
2804 w_oconv(nkf_char c2, nkf_char c1)
2810 output_bom_f = FALSE;
2821 if (c2 == 0 && nkf_char_unicode_p(c1)){
2822 val = c1 & VALUE_MASK;
2830 val = e2w_conv(c2, c1);
2832 val2 = e2w_combining(val, c2, c1);
/*
 * OUTPUT_UTF16_BYTES(c1, c2): write one UTF-16 unit, with the byte order
 * chosen by output_endian.
 * OUTPUT_UTF16(val): write val as UTF-16. BMP values are written as a single
 * unit; other values up to UNICODE_MAX are split into a surrogate pair.
 * Both macros use the variables c1/c2 of the expansion site.
 * NOTE(review): parts of both macros are on lines omitted from this listing.
 */
2840 #define OUTPUT_UTF16_BYTES(c1, c2) do { \
2841 if (output_endian == ENDIAN_LITTLE){ \
2850 #define OUTPUT_UTF16(val) do { \
2851 if (nkf_char_unicode_bmp_p(val)) { \
2852 c2 = (val >> 8) & 0xff; \
2854 OUTPUT_UTF16_BYTES(c1, c2); \
2856 val &= VALUE_MASK; \
2857 if (val <= UNICODE_MAX) { \
2858 c2 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ \
2859 c1 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ \
2860 OUTPUT_UTF16_BYTES(c2 & 0xff, (c2 >> 8) & 0xff); \
2861 OUTPUT_UTF16_BYTES(c1 & 0xff, (c1 >> 8) & 0xff); \
/*
 * w_oconv16: output converter for UTF-16.
 * Visible steps: output_bom_f is cleared and a BOM is written as the byte
 * pair (0xFF, 0xFE) through OUTPUT_UTF16_BYTES; tagged Unicode values are
 * handled separately; other codes are converted with e2w_conv() and checked
 * with e2w_combining(); single units are written with OUTPUT_UTF16_BYTES.
 * NOTE(review): the condition guarding the BOM and the remaining output
 * calls are on lines omitted from this listing.
 */
2867 w_oconv16(nkf_char c2, nkf_char c1)
2870 output_bom_f = FALSE;
2871 OUTPUT_UTF16_BYTES(0xFF, 0xFE);
2879 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2883 val = e2w_conv(c2, c1);
2885 val2 = e2w_combining(val, c2, c1);
2890 OUTPUT_UTF16_BYTES(c1, c2);
/*
 * OUTPUT_UTF32(c): write the code point c as UTF-32 bytes, least significant
 * byte first when output_endian is ENDIAN_LITTLE and most significant byte
 * first otherwise.
 * NOTE(review): the output of the fourth byte in each branch is on lines
 * omitted from this listing.
 */
2894 #define OUTPUT_UTF32(c) do { \
2895 if (output_endian == ENDIAN_LITTLE){ \
2896 (*o_putc)( (c) & 0xFF); \
2897 (*o_putc)(((c) >> 8) & 0xFF); \
2898 (*o_putc)(((c) >> 16) & 0xFF); \
2902 (*o_putc)(((c) >> 16) & 0xFF); \
2903 (*o_putc)(((c) >> 8) & 0xFF); \
2904 (*o_putc)( (c) & 0xFF); \
/*
 * w_oconv32: output converter for UTF-32.
 * Visible steps: output_bom_f is cleared, with byte-order-dependent handling
 * right after it; ISO_8859_1 and tagged Unicode values have their own
 * branches; other codes are converted with e2w_conv() and checked with
 * e2w_combining().
 * NOTE(review): the BOM bytes and the output calls are on lines omitted
 * from this listing.
 */
2909 w_oconv32(nkf_char c2, nkf_char c1)
2912 output_bom_f = FALSE;
2913 if (output_endian == ENDIAN_LITTLE){
2931 if (c2 == ISO_8859_1) {
2933 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2937 val = e2w_conv(c2, c1);
2939 val2 = e2w_combining(val, c2, c1);
/*
 * Score bits used by the input-encoding guesser. Each flag is the previous
 * one shifted left by one, so every flag occupies its own bit and the flags
 * are OR-ed into input_code.score (see set_code_score).
 */
2948 #define SCORE_L2 (1) /* Kanji Level 2 */
2949 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2950 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2951 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2952 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2953 #define SCORE_X0213 (SCORE_X0212 << 1) /* JIS X 0213 */
2954 #define SCORE_NO_EXIST (SCORE_X0213 << 1) /* Undefined Characters */
2955 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2956 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
/* initial score of every candidate (see status_reset) */
2958 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score lookup tables, indexed by the low nibble of a row byte (see
 * code_score). Each entry is the SCORE_* flag to add for that row.
 * NOTE(review): some initializer rows and the closing braces are on lines
 * omitted from this listing.
 */
/* rows whose bits 4-6 are 0x20 */
2960 static const nkf_char score_table_A0[] = {
2963 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2964 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_X0213,
/* rows whose bits 4-6 are 0x70 */
2967 static const nkf_char score_table_F0[] = {
2968 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2969 SCORE_L2, SCORE_DEPEND, SCORE_X0213, SCORE_X0213,
2970 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2971 SCORE_CP932, SCORE_X0213, SCORE_X0213, SCORE_ERROR,
/* 0x8f-prefixed codes, row bits 4-6 == 0x20 */
2974 static const nkf_char score_table_8FA0[] = {
2975 0, SCORE_X0213, SCORE_X0212, SCORE_X0213,
2976 SCORE_X0213, SCORE_X0213, SCORE_X0212, SCORE_X0212,
2977 SCORE_X0213, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2978 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
/* 0x8f-prefixed codes, row bits 4-6 == 0x60 */
2981 static const nkf_char score_table_8FE0[] = {
2982 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2983 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2984 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2985 SCORE_X0212, SCORE_X0212, SCORE_X0213, SCORE_X0213,
/* 0x8f-prefixed codes, row bits 4-6 == 0x70 */
2988 static const nkf_char score_table_8FF0[] = {
2989 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0212,
2990 SCORE_X0212, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2991 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2992 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
/*
 * set_code_score: OR the given SCORE_* bits into ptr->score.
 * NOTE(review): any guard in front of the update is on a line omitted from
 * this listing.
 */
2996 set_code_score(struct input_code *ptr, nkf_char score)
2999 ptr->score |= score;
/*
 * clr_code_score: clear the given SCORE_* bits in ptr->score.
 * NOTE(review): any guard in front of the update is on a line omitted from
 * this listing.
 */
3004 clr_code_score(struct input_code *ptr, nkf_char score)
3007 ptr->score &= ~score;
/*
 * code_score: classify the two-byte code held in ptr->buf[0..1] and add the
 * matching SCORE_* flag to the candidate's score.
 *   - SS2 prefix                  -> SCORE_KANA
 *   - 0x8f prefix                 -> score_table_8FA0 / 8FE0 / 8FF0 by the
 *                                    row bits of c1, else SCORE_X0212
 *   - no Unicode mapping (with UTF8_OUTPUT_ENABLE) -> SCORE_NO_EXIST
 *   - otherwise by the row bits of c2: score_table_A0, score_table_F0, or
 *     SCORE_L2 for rows 0x50 and above
 * NOTE(review): the condition leading to SCORE_ERROR is on a line omitted
 * from this listing.
 */
3012 code_score(struct input_code *ptr)
3014 nkf_char c2 = ptr->buf[0];
3015 nkf_char c1 = ptr->buf[1];
3017 set_code_score(ptr, SCORE_ERROR);
3018 }else if (c2 == SS2){
3019 set_code_score(ptr, SCORE_KANA);
3020 }else if (c2 == 0x8f){
/* bits 4-6 of the row select the table, the low nibble the entry */
3021 if ((c1 & 0x70) == 0x20){
3022 set_code_score(ptr, score_table_8FA0[c1 & 0x0f]);
3023 }else if ((c1 & 0x70) == 0x60){
3024 set_code_score(ptr, score_table_8FE0[c1 & 0x0f]);
3025 }else if ((c1 & 0x70) == 0x70){
3026 set_code_score(ptr, score_table_8FF0[c1 & 0x0f]);
3028 set_code_score(ptr, SCORE_X0212);
3030 #ifdef UTF8_OUTPUT_ENABLE
/* a code without a Unicode mapping counts as "does not exist" */
3031 }else if (!e2w_conv(c2, c1)){
3032 set_code_score(ptr, SCORE_NO_EXIST);
3034 }else if ((c2 & 0x70) == 0x20){
3035 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
3036 }else if ((c2 & 0x70) == 0x70){
3037 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
3038 }else if ((c2 & 0x70) >= 0x50){
3039 set_code_score(ptr, SCORE_L2);
/*
 * Small helpers operating on one encoding candidate (struct input_code).
 * NOTE(review): each helper is shown only partially in this listing; the
 * comments describe the visible statements only.
 */
/* status_disable: if this candidate's converter is the active one, reset it */
3044 status_disable(struct input_code *ptr)
3049 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* status_push_ch: append c to the candidate's byte buffer */
/* NOTE(review): no bounds check on ptr->index is visible - confirm buf size */
3053 status_push_ch(struct input_code *ptr, nkf_char c)
3055 ptr->buf[ptr->index++] = c;
/* status_clear: body not shown in this listing */
3059 status_clear(struct input_code *ptr)
/* status_reset: restore the initial score */
3066 status_reset(struct input_code *ptr)
3069 ptr->score = SCORE_INIT;
/* status_reinit: clear the per-file state */
3073 status_reinit(struct input_code *ptr)
3076 ptr->_file_stat = 0;
/* status_check: acts only on bytes up to DEL once the encoding is established */
3080 status_check(struct input_code *ptr, nkf_char c)
3082 if (c <= DEL && estab_f){
/*
 * s_status: state machine that tracks whether the byte stream is still
 * plausible as Shift_JIS for this candidate.
 * First-byte classification visible here:
 *   0xa1..0xdf            half-width kana (pushed as SS2 + c)
 *   0x81..0x9f, 0xe0..0xea  lead byte of a two-byte character
 *   0xed..0xee            lead byte (handled in its own state)
 *   IBM extension lead bytes (with SHIFTJIS_CP932)
 *   0xf0..0xfc            lead byte (with X0212_ENABLE)
 *   anything else         the candidate is disabled
 * In every second-byte state the trail byte must be in 0x40..0x7e or
 * 0x80..0xfc; the pair is then converted in place with s2e_conv(), otherwise
 * the candidate is disabled. Extension ranges add SCORE_CP932.
 * NOTE(review): the switch/case labels, state assignments and score/clear
 * calls are on lines omitted from this listing.
 */
3088 s_status(struct input_code *ptr, nkf_char c)
3092 status_check(ptr, c);
3097 }else if (nkf_char_unicode_p(c)){
3099 }else if (0xa1 <= c && c <= 0xdf){
3100 status_push_ch(ptr, SS2);
3101 status_push_ch(ptr, c);
3104 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
3106 status_push_ch(ptr, c);
3107 }else if (0xed <= c && c <= 0xee){
3109 status_push_ch(ptr, c);
3110 #ifdef SHIFTJIS_CP932
3111 }else if (is_ibmext_in_sjis(c)){
3113 status_push_ch(ptr, c);
3114 #endif /* SHIFTJIS_CP932 */
3116 }else if (0xf0 <= c && c <= 0xfc){
3118 status_push_ch(ptr, c);
3119 #endif /* X0212_ENABLE */
3121 status_disable(ptr);
/* second byte of an ordinary two-byte character */
3125 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3126 status_push_ch(ptr, c);
3127 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3131 status_disable(ptr);
3135 #ifdef SHIFTJIS_CP932
/* second byte after an extension lead byte: score only if convertible */
3136 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
3137 status_push_ch(ptr, c);
3138 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
3139 set_code_score(ptr, SCORE_CP932);
3144 #endif /* SHIFTJIS_CP932 */
3145 status_disable(ptr);
3148 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3149 status_push_ch(ptr, c);
3150 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3151 set_code_score(ptr, SCORE_CP932);
3154 status_disable(ptr);
/*
 * e_status: state machine that tracks whether the byte stream is still
 * plausible as EUC-JP for this candidate.
 * First byte: SS2 or 0xa1..0xfe starts a two-byte character; 0x8f (with
 * X0212_ENABLE) starts a three-byte character; anything else disables the
 * candidate. Following bytes must be in 0xa1..0xfe, otherwise the candidate
 * is disabled.
 * NOTE(review): the switch/case labels, state assignments and scoring calls
 * are on lines omitted from this listing.
 */
3161 e_status(struct input_code *ptr, nkf_char c)
3165 status_check(ptr, c);
3170 }else if (nkf_char_unicode_p(c)){
3172 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
3174 status_push_ch(ptr, c);
3176 }else if (0x8f == c){
3178 status_push_ch(ptr, c);
3179 #endif /* X0212_ENABLE */
3181 status_disable(ptr);
/* second byte of a two-byte character */
3185 if (0xa1 <= c && c <= 0xfe){
3186 status_push_ch(ptr, c);
3190 status_disable(ptr);
/* byte following the 0x8f prefix */
3195 if (0xa1 <= c && c <= 0xfe){
3197 status_push_ch(ptr, c);
3199 status_disable(ptr);
3201 #endif /* X0212_ENABLE */
3205 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: state machine that tracks whether the byte stream is still
 * plausible as UTF-8 for this candidate.
 * Lead bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 open a multi-byte
 * sequence; anything else disables the candidate. Continuation bytes must be
 * in 0x80..0xbf. When a sequence is complete it is checked for being the
 * byte order mark EF BB BF and converted in place with w2e_conv().
 * NOTE(review): the switch/case labels, state assignments, the use of the
 * bom flag and the scoring calls are on lines omitted from this listing.
 */
3207 w_status(struct input_code *ptr, nkf_char c)
3211 status_check(ptr, c);
3216 }else if (nkf_char_unicode_p(c)){
3218 }else if (0xc0 <= c && c <= 0xdf){
3220 status_push_ch(ptr, c);
3221 }else if (0xe0 <= c && c <= 0xef){
3223 status_push_ch(ptr, c);
3224 }else if (0xf0 <= c && c <= 0xf4){
3226 status_push_ch(ptr, c);
3228 status_disable(ptr);
3233 if (0x80 <= c && c <= 0xbf){
3234 status_push_ch(ptr, c);
/* all expected bytes have been collected */
3235 if (ptr->index > ptr->stat){
3236 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
3237 && ptr->buf[2] == 0xbf);
3238 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
3239 &ptr->buf[0], &ptr->buf[1]);
3246 status_disable(ptr);
3250 if (0x80 <= c && c <= 0xbf){
/* keep collecting continuation bytes until the sequence is complete */
3251 if (ptr->index < ptr->stat){
3252 status_push_ch(ptr, c);
3257 status_disable(ptr);
/*
 * code_status: feed the byte c to the status function of every encoding
 * candidate in input_code_list. If a single result is selected while the
 * encoding is not yet established, its converter is installed with
 * set_iconv(TRUE, ...); otherwise a byte up to DEL triggers a pass over the
 * candidate list.
 * NOTE(review): the loop structure, the rules that set result/action_flag
 * and the body of the final pass are on lines omitted from this listing.
 */
3265 code_status(nkf_char c)
3267 int action_flag = 1;
3268 struct input_code *result = 0;
3269 struct input_code *p = input_code_list;
/* entries without a status function are not fed */
3271 if (!p->status_func) {
3275 if (!p->status_func)
3277 (p->status_func)(p, c);
3280 }else if(p->stat == 0){
3291 if (result && !estab_f){
3292 set_iconv(TRUE, result->iconv_func);
3293 }else if (c <= DEL){
3294 struct input_code *ptr = input_code_list;
/*
 * nkf_state_t members visible in this listing: buffers for pushed-back
 * input (std_gc_buf) and "broken" input (broken_buf), plus two state
 * integers.
 * NOTE(review): the struct header, the nfc_buf member and the closing
 * typedef are on lines omitted from this listing.
 */
3304 nkf_buf_t *std_gc_buf;
3305 nkf_char broken_state;
3306 nkf_buf_t *broken_buf;
3307 nkf_char mimeout_state;
/* the state object starts as NULL and is allocated by nkf_state_init() */
3311 static nkf_state_t *nkf_state = NULL;
3313 #define STD_GC_BUFSIZE (256)
/*
 * nkf_state_init: one branch clears the three buffers of an existing state,
 * the other allocates the state and its buffers; both state integers are
 * then reset to 0.
 * NOTE(review): the condition choosing between the branches is on a line
 * omitted from this listing.
 */
3316 nkf_state_init(void)
3319 nkf_buf_clear(nkf_state->std_gc_buf);
3320 nkf_buf_clear(nkf_state->broken_buf);
3321 nkf_buf_clear(nkf_state->nfc_buf);
3324 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3325 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3326 nkf_state->broken_buf = nkf_buf_new(3);
3327 nkf_state->nfc_buf = nkf_buf_new(9);
3329 nkf_state->broken_state = 0;
3330 nkf_state->mimeout_state = 0;
/*
 * Standard input/output primitives.
 * NOTE(review): the header of the first function and the body of std_putc
 * are on lines omitted from this listing.
 */
/* pushed-back characters are returned before anything else is read */
3337 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3338 return nkf_buf_pop(nkf_state->std_gc_buf);
/* std_ungetc: push c back so the next read returns it; f is unused */
3345 std_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3347 nkf_buf_push(nkf_state->std_gc_buf, c);
/* std_putc: body not shown in this listing */
3353 std_putc(nkf_char c)
/* hold buffer: characters kept back while the input encoding is undecided */
3360 static nkf_char hold_buf[HOLD_SIZE*2];
3361 static int hold_count = 0;
/*
 * push_hold_buf: append c2 to hold_buf.
 * Returns EOF when the buffer is full after the append, otherwise the new
 * number of held characters.
 * NOTE(review): the action taken when the buffer is already full on entry
 * is on a line omitted from this listing.
 */
3363 push_hold_buf(nkf_char c2)
3365 if (hold_count >= HOLD_SIZE*2)
3367 hold_buf[hold_count++] = c2;
3368 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3372 h_conv(FILE *f, nkf_char c1, nkf_char c2)
3379 /** it must NOT be in the kanji shift sequence */
3380 /** it must NOT be written in JIS7 */
3381 /** and it must be after 2 byte 8bit code */
3387 while ((c2 = (*i_getc)(f)) != EOF) {
3393 if (push_hold_buf(c2) == EOF || estab_f) {
3399 struct input_code *p = input_code_list;
3400 struct input_code *result = p;
3405 if (p->status_func && p->score < result->score) {
3410 set_iconv(TRUE, result->iconv_func);
3415 ** 1) EOF is detected, or
3416 ** 2) Code is established, or
3417 ** 3) Buffer is FULL (but last word is pushed)
3419 ** in 1) and 3) cases, we continue to use
3420 ** Kanji codes by oconv and leave estab_f unchanged.
3425 while (hold_index < hold_count){
3426 c1 = hold_buf[hold_index++];
3427 if (nkf_char_unicode_p(c1)) {
3431 else if (c1 <= DEL){
3434 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3435 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3439 if (hold_index < hold_count){
3440 c2 = hold_buf[hold_index++];
3451 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3454 if (hold_index < hold_count){
3455 c3 = hold_buf[hold_index++];
3456 } else if ((c3 = (*i_getc)(f)) == EOF) {
3461 if (hold_index < hold_count){
3462 c4 = hold_buf[hold_index++];
3463 } else if ((c4 = (*i_getc)(f)) == EOF) {
3468 (*iconv)(c1, c2, (c3<<8)|c4);
3471 /* 4 bytes UTF-8 (check combining character) */
3472 if (hold_index < hold_count){
3473 c3 = hold_buf[hold_index++];
3475 } else if ((c3 = (*i_getc)(f)) == EOF) {
3476 w_iconv_nocombine(c1, c2, 0);
3479 if (hold_index < hold_count){
3480 c4 = hold_buf[hold_index++];
3482 } else if ((c4 = (*i_getc)(f)) == EOF) {
3483 w_iconv_nocombine(c1, c2, 0);
3484 if (fromhold_count <= 2)
3490 if (w_iconv_combine(c1, c2, 0, c3, c4, 0)) {
3491 w_iconv_nocombine(c1, c2, 0);
3492 if (fromhold_count <= 2) {
3495 } else if (fromhold_count == 3) {
3504 /* 3 bytes EUC or UTF-8 */
3505 if (hold_index < hold_count){
3506 c3 = hold_buf[hold_index++];
3508 } else if ((c3 = (*i_getc)(f)) == EOF) {
3514 if ((*iconv)(c1, c2, c3) == -3) {
3515 /* 6 bytes UTF-8 (check combining character) */
3517 if (hold_index < hold_count){
3518 c4 = hold_buf[hold_index++];
3520 } else if ((c4 = (*i_getc)(f)) == EOF) {
3521 w_iconv_nocombine(c1, c2, c3);
3524 if (hold_index < hold_count){
3525 c5 = hold_buf[hold_index++];
3527 } else if ((c5 = (*i_getc)(f)) == EOF) {
3528 w_iconv_nocombine(c1, c2, c3);
3529 if (fromhold_count == 4)
3535 if (hold_index < hold_count){
3536 c6 = hold_buf[hold_index++];
3538 } else if ((c6 = (*i_getc)(f)) == EOF) {
3539 w_iconv_nocombine(c1, c2, c3);
3540 if (fromhold_count == 5) {
3542 } else if (fromhold_count == 4) {
3551 if (w_iconv_combine(c1, c2, c3, c4, c5, c6)) {
3552 w_iconv_nocombine(c1, c2, c3);
3553 if (fromhold_count == 6) {
3555 } else if (fromhold_count == 5) {
3558 } else if (fromhold_count == 4) {
3571 if (c3 == EOF) break;
3577 * Check and Ignore BOM
/*
 * Look at the first bytes of f for a Unicode byte order mark.
 * Recognised sequences and their effect (as far as the visible lines show):
 *   00 00 FE FF  -> w_iconv32, input_endian = ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, input_endian = ENDIAN_2143
 *   EF BB BF     -> w_iconv
 *   FE FF 00 00  -> w_iconv32, input_endian = ENDIAN_3412
 *   FE FF        -> w_iconv16, input_endian = ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, input_endian = ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, input_endian = ENDIAN_LITTLE
 * The converter is only switched when no input_encoding was requested.
 * When a sequence does not complete, or the active converter does not match,
 * the bytes read so far are pushed back in reverse order so they are read
 * again as ordinary input.
 * NOTE(review): the case labels are not visible here; the first byte of each
 * branch is inferred from the value pushed back at the end of that branch.
 */
3583 input_bom_f = FALSE;
3584 switch(c2 = (*i_getc)(f)){
/* First byte 0x00: candidates are the 32-bit marks 00 00 FE FF and 00 00 FF FE. */
3586 if((c2 = (*i_getc)(f)) == 0x00){
3587 if((c2 = (*i_getc)(f)) == 0xFE){
3588 if((c2 = (*i_getc)(f)) == 0xFF){
3589 if(!input_encoding){
3590 set_iconv(TRUE, w_iconv32);
3592 if (iconv == w_iconv32) {
3594 input_endian = ENDIAN_BIG;
3597 (*i_ungetc)(0xFF,f);
3598 }else (*i_ungetc)(c2,f);
3599 (*i_ungetc)(0xFE,f);
3600 }else if(c2 == 0xFF){
3601 if((c2 = (*i_getc)(f)) == 0xFE){
3602 if(!input_encoding){
3603 set_iconv(TRUE, w_iconv32);
3605 if (iconv == w_iconv32) {
3606 input_endian = ENDIAN_2143;
/* The byte just matched was 0xFE, so that is the value to push back
 * (this line previously pushed 0xFF and corrupted the replayed input). */
3609 (*i_ungetc)(0xFE,f);
3610 }else (*i_ungetc)(c2,f);
3611 (*i_ungetc)(0xFF,f);
3612 }else (*i_ungetc)(c2,f);
3613 (*i_ungetc)(0x00,f);
3614 }else (*i_ungetc)(c2,f);
3615 (*i_ungetc)(0x00,f);
/* First byte 0xEF: candidate is the UTF-8 mark EF BB BF. */
3618 if((c2 = (*i_getc)(f)) == 0xBB){
3619 if((c2 = (*i_getc)(f)) == 0xBF){
3620 if(!input_encoding){
3621 set_iconv(TRUE, w_iconv);
3623 if (iconv == w_iconv) {
3627 (*i_ungetc)(0xBF,f);
3628 }else (*i_ungetc)(c2,f);
3629 (*i_ungetc)(0xBB,f);
3630 }else (*i_ungetc)(c2,f);
3631 (*i_ungetc)(0xEF,f);
/* First byte 0xFE: FE FF 00 00 is tried before the 16-bit mark FE FF. */
3634 if((c2 = (*i_getc)(f)) == 0xFF){
3635 if((c2 = (*i_getc)(f)) == 0x00){
3636 if((c2 = (*i_getc)(f)) == 0x00){
3637 if(!input_encoding){
3638 set_iconv(TRUE, w_iconv32);
3640 if (iconv == w_iconv32) {
3641 input_endian = ENDIAN_3412;
3644 (*i_ungetc)(0x00,f);
3645 }else (*i_ungetc)(c2,f);
3646 (*i_ungetc)(0x00,f);
3647 }else (*i_ungetc)(c2,f);
3648 if(!input_encoding){
3649 set_iconv(TRUE, w_iconv16);
3651 if (iconv == w_iconv16) {
3652 input_endian = ENDIAN_BIG;
3656 (*i_ungetc)(0xFF,f);
3657 }else (*i_ungetc)(c2,f);
3658 (*i_ungetc)(0xFE,f);
/* First byte 0xFF: FF FE 00 00 is tried before the 16-bit mark FF FE. */
3661 if((c2 = (*i_getc)(f)) == 0xFE){
3662 if((c2 = (*i_getc)(f)) == 0x00){
3663 if((c2 = (*i_getc)(f)) == 0x00){
3664 if(!input_encoding){
3665 set_iconv(TRUE, w_iconv32);
3667 if (iconv == w_iconv32) {
3668 input_endian = ENDIAN_LITTLE;
3672 (*i_ungetc)(0x00,f);
3673 }else (*i_ungetc)(c2,f);
3674 (*i_ungetc)(0x00,f);
3675 }else (*i_ungetc)(c2,f);
3676 if(!input_encoding){
3677 set_iconv(TRUE, w_iconv16);
3679 if (iconv == w_iconv16) {
3680 input_endian = ENDIAN_LITTLE;
3684 (*i_ungetc)(0xFE,f);
3685 }else (*i_ungetc)(c2,f);
3686 (*i_ungetc)(0xFF,f);
/*
 * Input filter for JIS text whose escape bytes were lost.
 * Bytes queued in broken_buf are returned first.  Otherwise, when the current
 * byte is $ or ( , the previous byte (broken_state) was not ESC and input_mode
 * allows it, the next byte c1 is inspected: $ followed by @ or B, and
 * ( followed by J or B, cause c1 and c to be queued in broken_buf.
 * In all other cases the current byte is remembered in broken_state.
 * NOTE(review): the value returned after queuing is not visible here;
 * presumably the missing ESC is supplied -- confirm.
 */
3695 broken_getc(FILE *f)
3699 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3700 return nkf_buf_pop(nkf_state->broken_buf);
3703 if (c=='$' && nkf_state->broken_state != ESC
3704 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3706 nkf_state->broken_state = 0;
3707 if (c1=='@'|| c1=='B') {
/* Queued in reverse so that popping yields c and then c1. */
3708 nkf_buf_push(nkf_state->broken_buf, c1);
3709 nkf_buf_push(nkf_state->broken_buf, c);
3715 } else if (c=='(' && nkf_state->broken_state != ESC
3716 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3718 nkf_state->broken_state = 0;
3719 if (c1=='J'|| c1=='B') {
3720 nkf_buf_push(nkf_state->broken_buf, c1);
3721 nkf_buf_push(nkf_state->broken_buf, c);
3728 nkf_state->broken_state = c;
/*
 * Push c back for broken_getc.  The byte is stored only while fewer than two
 * bytes are queued in broken_buf; the FILE argument is unused.
 */
3734 broken_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3736 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3737 nkf_buf_push(nkf_state->broken_buf, c);
/*
 * End-of-line filter placed in front of o_eol_conv.
 * Guessing part (only when guess_f is set and input_eol is not yet EOF):
 * input_eol starts at 0, becomes CR, LF or CRLF on the first line ending seen,
 * and is set to EOF as soon as a different ending shows up.
 * Output part: when a line ending is complete (a pending CR or an LF), CR is
 * emitted unless eolmode_f is LF and LF is emitted unless eolmode_f is CR.
 * A lone CR is remembered in prev_cr; every character other than a bare LF is
 * forwarded unchanged.
 */
3742 eol_conv(nkf_char c2, nkf_char c1)
3744 if (guess_f && input_eol != EOF) {
3745 if (c2 == 0 && c1 == LF) {
3746 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3747 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3748 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3750 else if (!input_eol) input_eol = CR;
3751 else if (input_eol != CR) input_eol = EOF;
3753 if (prev_cr || (c2 == 0 && c1 == LF)) {
3755 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3756 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3758 if (c2 == 0 && c1 == CR) prev_cr = CR;
3759 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
/*
 * Emit a line ending through the single-argument output function func.
 * The style is eolmode_f, or DEFAULT_NEWLINE when eolmode_f is 0.
 */
3763 put_newline(void (*func)(nkf_char))
3765 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
/*
 * Same selection as put_newline, for a two-argument (c2, c1) output function.
 */
3780 oconv_newline(void (*func)(nkf_char, nkf_char))
3782 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3797 Return value of fold_conv()
3799 LF add newline and output char
3800 CR add newline and output nothing
3803 1 (or else) normal output
3805 fold state in prev (previous character)
3807 >0x80 Japanese (X0208/X0201)
3812 This fold algorithm does not preserve leading spaces in a line.
3813 This is the main difference from fmt.
/* Column width of a character: 2 when c2 is non-zero, otherwise 1. */
3816 #define char_size(c2,c1) (c2?2:1)
/*
 * Line folding filter.
 * For each character a fold_state is computed (0, SP, CR, LF or 1, as the
 * inline notes on the assignments describe) while f_line tracks the current
 * column and f_prev the previous character class.  Characters that must not
 * start a line (kinsoku) are given fold_state 1 until the line exceeds
 * fold_len + fold_margin.  The final switch emits newlines with
 * oconv_newline(o_fconv).
 * NOTE(review): many statements of this function are not visible here.
 */
3819 fold_conv(nkf_char c2, nkf_char c1)
3822 nkf_char fold_state;
3824 if (c1== CR && !fold_preserve_f) {
3825 fold_state=0; /* ignore cr */
3826 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3828 fold_state=0; /* ignore cr */
3829 } else if (c1== BS) {
3830 if (f_line>0) f_line--;
3832 } else if (c2==EOF && f_line != 0) { /* close open last line */
3834 } else if ((c1==LF && !fold_preserve_f)
3835 || ((c1==CR||(c1==LF&&f_prev!=CR))
3836 && fold_preserve_f)) {
3838 if (fold_preserve_f) {
3842 } else if ((f_prev == c1)
3844 ) { /* duplicate newline */
3847 fold_state = LF; /* output two newline */
3853 if (f_prev&0x80) { /* Japanese? */
3855 fold_state = 0; /* ignore given single newline */
3856 } else if (f_prev==SP) {
3860 if (++f_line<=fold_len)
3864 fold_state = CR; /* fold and output nothing */
3868 } else if (c1=='\f') {
3871 fold_state = LF; /* output newline and clear */
3872 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3873 /* X0208 kankaku or ascii space */
3875 fold_state = 0; /* remove duplicate spaces */
3878 if (++f_line<=fold_len)
3879 fold_state = SP; /* output ASCII space only */
3881 f_prev = SP; f_line = 0;
3882 fold_state = CR; /* fold and output nothing */
3886 prev0 = f_prev; /* we still need this one... , but almost done */
3888 if (c2 || c2 == JIS_X_0201_1976_K)
3889 f_prev |= 0x80; /* this is Japanese */
/* JIS X 0201 kana count as one column, everything else by char_size. */
3890 f_line += c2 == JIS_X_0201_1976_K ? 1: char_size(c2,c1);
3891 if (f_line<=fold_len) { /* normal case */
3894 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3895 f_line = char_size(c2,c1);
3896 fold_state = LF; /* We can't wait, do fold now */
3897 } else if (c2 == JIS_X_0201_1976_K) {
3898 /* simple kinsoku rules return 1 means no folding */
3899 if (c1==(0xde&0x7f)) fold_state = 1; /* halfwidth voiced sound mark (dakuten) */
3900 else if (c1==(0xdf&0x7f)) fold_state = 1; /* halfwidth semi-voiced sound mark (handakuten) */
3901 else if (c1==(0xa4&0x7f)) fold_state = 1; /* halfwidth ideographic comma */
3902 else if (c1==(0xa3&0x7f)) fold_state = 1; /* halfwidth right corner bracket */
3903 else if (c1==(0xa1&0x7f)) fold_state = 1; /* halfwidth ideographic full stop */
3904 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3905 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3907 fold_state = LF;/* add one new f_line before this character */
3910 fold_state = LF;/* add one new f_line before this character */
3913 /* kinsoku point in ASCII */
3914 if ( c1==')'|| /* { [ ( */
3925 /* just after special */
3926 } else if (!is_alnum(prev0)) {
3927 f_line = char_size(c2,c1);
3929 } else if ((prev0==SP) || /* ignored new f_line */
3930 (prev0==LF)|| /* ignored new f_line */
3931 (prev0&0x80)) { /* X0208 - ASCII */
3932 f_line = char_size(c2,c1);
3933 fold_state = LF;/* add one new f_line before this character */
3935 fold_state = 1; /* default no fold in ASCII */
/* The notes below name the JIS X 0208 character 0x21xx that each c1 value stands for. */
3939 if (c1=='"') fold_state = 1; /* 0x2122 ideographic comma */
3940 else if (c1=='#') fold_state = 1; /* 0x2123 ideographic full stop */
3941 else if (c1=='W') fold_state = 1; /* 0x2157 right corner bracket */
3942 else if (c1=='K') fold_state = 1; /* 0x214B fullwidth right parenthesis */
3943 else if (c1=='$') fold_state = 1; /* 0x2124 fullwidth comma */
3944 else if (c1=='%') fold_state = 1; /* 0x2125 fullwidth full stop */
3945 else if (c1=='\'') fold_state = 1; /* 0x2127 fullwidth colon -- NOTE(review): the earlier note encoded 0x215C; confirm */
3946 else if (c1=='(') fold_state = 1; /* 0x2128 fullwidth semicolon */
3947 else if (c1==')') fold_state = 1; /* 0x2129 fullwidth question mark */
3948 else if (c1=='*') fold_state = 1; /* 0x212A fullwidth exclamation mark */
3949 else if (c1=='+') fold_state = 1; /* 0x212B voiced sound mark (dakuten) */
3950 else if (c1==',') fold_state = 1; /* 0x212C semi-voiced sound mark (handakuten) */
3951 /* default no fold in kinsoku */
3954 f_line = char_size(c2,c1);
3955 /* add one new f_line before this character */
3958 f_line = char_size(c2,c1);
3960 /* add one new f_line before this character */
3965 /* terminator process */
3966 switch(fold_state) {
3968 oconv_newline(o_fconv);
3974 oconv_newline(o_fconv);
/* One-character look-behind for z_conv: a pending JIS X 0201 kana waiting for a sound mark. */
3985 static nkf_char z_prev2=0,z_prev1=0;
/*
 * Width and entity conversion filter in front of o_zconv.
 * Visible behaviour:
 *  - A pending JIS X 0201 kana (z_prev2, z_prev1) followed by a voiced or
 *    semi-voiced sound mark is emitted as the combined character from the
 *    dv, ev or ev_x0213 tables; otherwise the plain form from cv is emitted.
 *  - A kana that has an entry in dv, ev or ev_x0213 is held back to wait for
 *    such a mark.
 *  - alpha_f bits select further conversions; with bit 8 set, the single-byte
 *    characters > < " and & are replaced by their HTML entities.
 *  - JIS X 0208 row 0x25 katakana is mapped to one or two JIS X 0201 kana
 *    through fullwidth_to_halfwidth (high byte = kana, low byte = sound mark).
 *  - Unicode U+3099 and U+309A map to the halfwidth marks 0x5E and 0x5F.
 * NOTE(review): many statements of this function are not visible here.
 */
3988 z_conv(nkf_char c2, nkf_char c1)
3991 /* if (c2) c1 &= 0x7f; assertion */
3993 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3999 if (z_prev2 == JIS_X_0201_1976_K) {
4000 if (c2 == JIS_X_0201_1976_K) {
4001 if (c1 == (0xde&0x7f)) { /* voiced sound mark (dakuten) */
4003 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4005 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* semi-voiced sound mark (handakuten) */
4007 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4009 } else if (x0213_f && c1 == (0xdf&0x7f) && ev_x0213[(z_prev1-SP)*2]) { /* semi-voiced sound mark (handakuten) */
4011 (*o_zconv)(ev_x0213[(z_prev1-SP)*2], ev_x0213[(z_prev1-SP)*2+1]);
4016 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4018 if (c2 == JIS_X_0201_1976_K) {
4019 if (dv[(c1-SP)*2] || ev[(c1-SP)*2] || (x0213_f && ev_x0213[(c1-SP)*2])) {
4020 /* wait for a voiced or semi-voiced sound mark */
4025 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4036 if (alpha_f&1 && c2 == 0x23) {
4037 /* JISX0208 Alphabet */
4039 } else if (c2 == 0x21) {
4040 /* JISX0208 Kigou */
4045 } else if (alpha_f&4) {
4050 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4056 if (alpha_f&8 && c2 == 0) {
4058 const char *entity = 0;
/* HTML entity replacements; the literals must be the escaped forms. */
4060 case '>': entity = "&gt;"; break;
4061 case '<': entity = "&lt;"; break;
4062 case '\"': entity = "&quot;"; break;
4063 case '&': entity = "&amp;"; break;
/* Emit the entity one byte at a time. */
4066 while (*entity) (*o_zconv)(0, *entity++);
4072 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4077 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4081 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4085 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4089 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4093 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4097 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4101 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4105 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4110 (*o_zconv)(JIS_X_0201_1976_K, c);
4113 } else if (c2 == 0x25) {
4114 /* JISX0208 Katakana */
/* Indexed by c1-0x20; 0 means no halfwidth form. */
4115 static const int fullwidth_to_halfwidth[] =
4117 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4118 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4119 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4120 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4121 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4122 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4123 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4124 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4125 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4126 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4127 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x365F,
4128 0x375F, 0x385F, 0x395F, 0x3A5F, 0x3E5F, 0x425F, 0x445F, 0x0000
4130 if (fullwidth_to_halfwidth[c1-0x20]){
4131 c2 = fullwidth_to_halfwidth[c1-0x20];
4132 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
4134 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
4138 } else if (c2 == 0 && nkf_char_unicode_p(c1) &&
4139 ((c1&VALUE_MASK) == 0x3099 || (c1&VALUE_MASK) == 0x309A)) { /* combining voiced / semi-voiced sound marks */
4140 (*o_zconv)(JIS_X_0201_1976_K, 0x5E + (c1&VALUE_MASK) - 0x3099);
/*
 * rot13(c): letters up to M or m are shifted by +13, letters up to Z or z by -13.
 * rot47(c): characters up to O are shifted by +47, characters up to ~ by -47.
 * rot_conv applies a rotation for single-byte input (c2 is 0, JIS X 0201 kana
 * or ISO-8859-1) and forwards the result to o_rot_conv.
 * NOTE(review): the lower-bound tests of both macros and the two-byte branch
 * of rot_conv are not visible here.
 */
4148 #define rot13(c) ( \
4150 (c <= 'M') ? (c + 13): \
4151 (c <= 'Z') ? (c - 13): \
4153 (c <= 'm') ? (c + 13): \
4154 (c <= 'z') ? (c - 13): \
4158 #define rot47(c) ( \
4160 ( c <= 'O') ? (c + 47) : \
4161 ( c <= '~') ? (c - 47) : \
4166 rot_conv(nkf_char c2, nkf_char c1)
4168 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
4174 (*o_rot_conv)(c2,c1);
/*
 * Kana conversion filter in front of o_hira_conv.
 * The first group of tests handles c1 in 0x21..0x73, the special cell 0x74
 * (sent as Unicode U+3094 when the output encoding is Unicode) and the row
 * 0x21 cells 0x33 and 0x34.  The second group handles the opposite direction:
 * U+3094, row 0x24 cells 0x21..0x73 and the row 0x21 cells 0x35 and 0x36.
 * Everything else is forwarded unchanged by the final call.
 * NOTE(review): the enclosing mode tests and the row rewrites are not visible
 * here; presumably the two groups are katakana-to-hiragana and
 * hiragana-to-katakana -- confirm.
 */
4178 hira_conv(nkf_char c2, nkf_char c1)
4182 if (0x20 < c1 && c1 < 0x74) {
4184 (*o_hira_conv)(c2,c1);
4186 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
4188 c1 = nkf_char_unicode_new(0x3094);
4189 (*o_hira_conv)(c2,c1);
4192 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4194 (*o_hira_conv)(c2,c1);
4199 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
4202 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4204 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4208 (*o_hira_conv)(c2,c1);
/*
 * Range filter in front of o_iso2022jp_check_conv.
 * Three checks are visible: c2 in 0x00..0x20 with c1 in 0x7f..0xff, c2 in
 * rows 0x29..0x2f or 0x75..0x7e, and a scan of the range table for a pair
 * start..end that contains the code c.  The character is finally forwarded.
 * NOTE(review): what happens when a check matches is not visible here;
 * presumably the character is replaced -- confirm.
 */
4213 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
/* Number of start/end pairs in the range table. */
4215 #define RANGE_NUM_MAX 18
4216 static const nkf_char range[RANGE_NUM_MAX][2] = {
4237 nkf_char start, end, c;
4239 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4243 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4248 for (i = 0; i < RANGE_NUM_MAX; i++) {
4249 start = range[i][0];
4252 if (c >= start && c <= end) {
4257 (*o_iso2022jp_check_conv)(c2,c1);
4261 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * Encoded-word headers recognised on input and written on output.
 * \075 is the octal escape for the = character.  The entries are indexed in
 * step with mime_priority_func, mime_encode and mime_encode_method.
 */
4263 static const unsigned char *mime_pattern[] = {
4264 (const unsigned char *)"\075?EUC-JP?B?",
4265 (const unsigned char *)"\075?SHIFT_JIS?B?",
4266 (const unsigned char *)"\075?ISO-8859-1?Q?",
4267 (const unsigned char *)"\075?ISO-8859-1?B?",
4268 (const unsigned char *)"\075?ISO-2022-JP?B?",
4269 (const unsigned char *)"\075?ISO-2022-JP?B?",
4270 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4271 #if defined(UTF8_INPUT_ENABLE)
4272 (const unsigned char *)"\075?UTF-8?B?",
4273 (const unsigned char *)"\075?UTF-8?Q?",
4275 (const unsigned char *)"\075?US-ASCII?Q?",
4280 /* Marker used to raise the priority of the matching encoding. */
/* One converter per mime_pattern entry; 0 means no specific converter. */
4281 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4282 e_iconv, s_iconv, 0, 0, 0, 0, 0,
4283 #if defined(UTF8_INPUT_ENABLE)
/* Output mode associated with each mime_pattern entry; open_mime searches it for the requested mode. */
4289 static const nkf_char mime_encode[] = {
4290 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K,
4291 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding letter (B or Q) of each mime_pattern entry. */
4298 static const nkf_char mime_encode_method[] = {
4299 'B', 'B','Q', 'B', 'B', 'B', 'Q',
4300 #if defined(UTF8_INPUT_ENABLE)
4308 /* MIME preprocessor fifo */
/* The size must stay a power of two because indices are wrapped with MIME_BUF_MASK. */
4310 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
4311 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
/* Access slot n of the ring buffer; n may grow without bound, it is masked here. */
4312 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
4314 unsigned char buf[MIME_BUF_SIZE];
4316 unsigned int last; /* decoded */
4317 unsigned int input; /* undecoded */
/* Converter that was active before a MIME word switched it; restored by unswitch_mime_getc. */
4319 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Size of the recovery buffer and limit of the header scan in the MIME begin routines. */
4321 #define MAXRECOVER 20
/* Put c back in front of the ring buffer by stepping top back one slot. */
4324 mime_input_buf_unshift(nkf_char c)
4326 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
/* ungetc counterpart of mime_getc: stores c at the front of the ring buffer; f is unused. */
4330 mime_ungetc(nkf_char c, ARG_UNUSED FILE *f)
4332 mime_input_buf_unshift(c);
/*
 * Push c back on the undecoded side: either through the saved i_mungetc_buf
 * hook or by stepping the input index of the ring buffer back.
 * NOTE(review): the condition choosing between the two is not visible here;
 * mime_getc_buf uses mimebuf_f for the same choice -- confirm.
 */
4337 mime_ungetc_buf(nkf_char c, FILE *f)
4340 (*i_mungetc_buf)(c,f);
4342 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
/*
 * Read one undecoded byte: through the saved i_mgetc_buf hook when mimebuf_f
 * is set, otherwise from the ring buffer at the input index.
 */
4347 mime_getc_buf(FILE *f)
4349 /* we don't keep eof of mime_input_buf, because it contains ?= as
4350 a terminator. It was checked in mime_integrity. */
4351 return ((mimebuf_f)?
4352 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
/*
 * Install mime_getc and mime_ungetc as the active input hooks, saving the
 * previous ones in i_mgetc and i_mungetc.  In STRICT_MIME mode those saved
 * hooks are saved once more in the _buf variables and replaced by the ring
 * buffer readers.  Nothing happens when mime_getc is already installed.
 */
4356 switch_mime_getc(void)
4358 if (i_getc!=mime_getc) {
4359 i_mgetc = i_getc; i_getc = mime_getc;
4360 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4361 if(mime_f==STRICT_MIME) {
4362 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4363 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * Undo switch_mime_getc: restore the saved input hooks and, when a converter
 * was saved in mime_iconv_back, reinstall it and clear the saved pointer.
 */
4369 unswitch_mime_getc(void)
4371 if(mime_f==STRICT_MIME) {
4372 i_mgetc = i_mgetc_buf;
4373 i_mungetc = i_mungetc_buf;
4376 i_ungetc = i_mungetc;
4377 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4378 mime_iconv_back = NULL;
/*
 * Pre-scan an encoded word into the MIME ring buffer before decoding it.
 *   f - input stream
 *   p - the header pattern that was matched; it is copied into the buffer first
 * Bytes are then read from f until the terminating ?= pair is seen, the ring
 * buffer is full, or a byte outside + / = ? and alphanumerics appears.
 * On a complete word the input index is rewound to q, the position just after
 * the header.  On an incomplete word, last is moved up to input,
 * mime_decode_mode is set to 1 and the buffered getc is installed so that the
 * raw bytes are replayed without decoding.
 * NOTE(review): d presumably holds the previous byte, and the return values
 * are not visible here -- confirm.
 */
4382 mime_integrity(FILE *f, const unsigned char *p)
4386 /* In buffered mode, read until =? or NL or buffer full
4388 mime_input_state.input = mime_input_state.top;
4389 mime_input_state.last = mime_input_state.top;
4391 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
4393 q = mime_input_state.input;
4394 while((c=(*i_getc)(f))!=EOF) {
4395 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
4396 break; /* buffer full */
4398 if (c=='=' && d=='?') {
4399 /* checked. skip header, start decode */
4400 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4401 /* mime_last_input = mime_input_state.input; */
4402 mime_input_state.input = q;
4406 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4408 /* Should we check length mod 4? */
4409 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4412 /* In case of Incomplete MIME, no MIME decode */
4413 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4414 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
4415 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
4416 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Strict-mode start of an encoded word; the leading =? was already consumed.
 * The following bytes are compared, case-insensitively, with the entries of
 * mime_pattern.  Each byte read is also stored in r so that it can be replayed
 * if no pattern matches.  When a pattern fails at position i, later patterns
 * that share the same first i bytes and accept the current byte are tried.
 * On a match the decode mode is taken from the pattern, the converter
 * associated with it is installed (the previous one is kept in
 * mime_iconv_back) and mime_integrity() performs the final check.
 * NOTE(review): the recovery path and its return value are not visible here.
 */
4421 mime_begin_strict(FILE *f)
4425 const unsigned char *p,*q;
4426 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4428 mime_decode_mode = FALSE;
4429 /* =? has been checked */
4431 p = mime_pattern[j];
4434 for(i=2;p[i]>SP;i++) { /* start at =? */
4435 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4436 /* pattern fails, try next one */
4438 while (mime_pattern[++j]) {
4439 p = mime_pattern[j];
4440 for(k=2;k<i;k++) /* assume length(p) > i */
4441 if (p[k]!=q[k]) break;
4442 if (k==i && nkf_toupper(c1)==p[k]) break;
4444 p = mime_pattern[j];
4445 if (p) continue; /* found next one, continue */
4446 /* all fails, output from recovery buffer */
/* p[i-2] is the encoding letter (B or Q) that precedes the final ? of the pattern. */
4454 mime_decode_mode = p[i-2];
4456 mime_iconv_back = iconv;
4457 set_iconv(FALSE, mime_priority_func[j]);
4458 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4460 if (mime_decode_mode=='B') {
4461 mimebuf_f = unbuf_f;
4463 /* do MIME integrity check */
4464 return mime_integrity(f,mime_pattern[j]);
/*
 * Non-strict start of an encoded word.  Every byte read is copied into the
 * ring buffer (from index k onward) so the text can be replayed when it turns
 * out not to be a valid header.  The charset part accepts line breaks,
 * spaces, hyphen, underscore and alphanumerics; the encoding letter selects
 * decode mode B or Q.  At most MAXRECOVER bytes are examined.
 * NOTE(review): the function header and parts of the scan are not visible here.
 */
4478 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4479 /* re-read and convert again from mime_buffer. */
4481 /* =? has been checked */
4482 k = mime_input_state.last;
4483 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4484 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4485 /* We accept any character type even if it is broken by new lines */
4486 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4487 if (c1==LF||c1==SP||c1==CR||
4488 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4490 /* Failed. But this could be another MIME preamble */
4492 mime_input_state.last--;
4498 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4499 if (!(++i<MAXRECOVER) || c1==EOF) break;
4500 if (c1=='b'||c1=='B') {
4501 mime_decode_mode = 'B';
4502 } else if (c1=='q'||c1=='Q') {
4503 mime_decode_mode = 'Q';
4507 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4508 if (!(++i<MAXRECOVER) || c1==EOF) break;
4510 mime_decode_mode = FALSE;
4516 if (!mime_decode_mode) {
4517 /* false MIME preamble, restart from mime_buffer */
4518 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4519 /* Since we are in MIME mode until buffer becomes empty, */
4520 /* we never go into mime_begin again for a while. */
4523 /* discard mime preamble, and goto MIME mode */
4524 mime_input_state.last = k;
4525 /* do no MIME integrity check */
4526 return c1; /* used only for checking EOF */
/* Output function that takes a character and does not use it. NOTE(review): body not visible here. */
4531 no_putc(ARG_UNUSED nkf_char c)
/* Write str and a newline to stderr; a null pointer is printed as the text NULL. */
4537 debug(const char *str)
4540 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Record the name of the detected input encoding.
 * The first call stores codename.  A later call with a different name
 * replaces the stored name with the empty string.
 */
4546 set_input_codename(const char *codename)
4548 if (!input_codename) {
4549 input_codename = codename;
4550 } else if (strcmp(codename, input_codename) != 0) {
4551 input_codename = "";
/*
 * Return the final name of the guessed input encoding.
 * An empty stored name is reported as BINARY and a missing one as ASCII.
 * For Shift_JIS, EUC-JP and ISO-2022-JP the score bits of the active
 * converter refine the name to a more specific variant.
 * The result is also stored back into input_codename.
 */
4556 get_guessed_code(void)
4558 if (input_codename && !*input_codename) {
4559 input_codename = "BINARY";
4561 struct input_code *p = find_inputcode_byfunc(iconv);
4562 if (!input_codename) {
4563 input_codename = "ASCII";
4564 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4565 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4566 input_codename = "CP932";
4567 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4568 if (p->score & SCORE_X0213)
4569 input_codename = "EUC-JIS-2004";
4570 else if (p->score & (SCORE_X0212))
4571 input_codename = "EUCJP-MS";
4572 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4573 input_codename = "CP51932";
4574 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4575 if (p->score & (SCORE_KANA))
4576 input_codename = "CP50221";
4577 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4578 input_codename = "CP50220";
4581 return input_codename;
4584 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Print the guessed encoding to stdout, prefixed by the file name and a colon
 * when filename is not NULL.  The short form prints only the name.  The long
 * form appends the byte order (only for the 16 and 32 bit Unicode
 * converters), a BOM marker and the detected line ending; input_eol equal to
 * EOF is shown as mixed line endings.
 * NOTE(review): the test choosing between the short and long form is not
 * visible here.
 */
4586 print_guessed_code(char *filename)
4588 if (filename != NULL) printf("%s: ", filename);
4589 if (input_codename && !*input_codename) {
4592 input_codename = get_guessed_code();
4594 printf("%s\n", input_codename);
4596 printf("%s%s%s%s\n",
4598 iconv != w_iconv16 && iconv != w_iconv32 ? "" :
4599 input_endian == ENDIAN_LITTLE ? " LE" :
4600 input_endian == ENDIAN_BIG ? " BE" :
4602 input_bom_f ? " (BOM)" : "",
4603 input_eol == CR ? " (CR)" :
4604 input_eol == LF ? " (LF)" :
4605 input_eol == CRLF ? " (CRLF)" :
4606 input_eol == EOF ? " (MIXED NL)" :
/*
 * Shared reader for two-digit hexadecimal escapes.
 *   ch - the character that introduces an escape
 *   f  - input stream
 *   g  - getc function used to read from f
 *   u  - ungetc function used to push bytes back
 * When both following bytes are hexadecimal digits the decoded byte is
 * returned (first digit is the high nibble).
 * NOTE(review): the pushback performed when a digit test fails is not visible here.
 */
4616 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4618 nkf_char c1, c2, c3;
4624 if (!nkf_isxdigit(c2)){
4629 if (!nkf_isxdigit(c3)){
4634 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Reader for escapes introduced by a colon, built on hex_getc with the i_cgetc and i_cungetc hooks. */
4640 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Push c back through the hook below the colon-escape reader. */
4644 cap_ungetc(nkf_char c, FILE *f)
4646 return (*i_cungetc)(c, f);
/* Reader for escapes introduced by a percent sign, built on hex_getc with the i_ugetc and i_uungetc hooks. */
4652 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Push c back through the hook below the percent-escape reader. */
4656 url_ungetc(nkf_char c, FILE *f)
4658 return (*i_uungetc)(c, f);
4662 #ifdef NUMCHAR_OPTION
/*
 * Reader that decodes numeric character references from the stream.
 * After an x or X up to 7 hexadecimal digits are accumulated; otherwise up to
 * 8 decimal digits.  A successfully parsed value is returned as a Unicode
 * nkf_char via nkf_char_unicode_new().
 * NOTE(review): the recognition of the reference prefix and terminator and
 * the pushback on failure are not visible here.
 */
4664 numchar_getc(FILE *f)
4666 nkf_char (*g)(FILE *) = i_ngetc;
4667 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4678 if (buf[i] == 'x' || buf[i] == 'X'){
4679 for (j = 0; j < 7; j++){
4681 if (!nkf_isxdigit(buf[i])){
4688 c |= hex2bin(buf[i]);
4691 for (j = 0; j < 8; j++){
4695 if (!nkf_isdigit(buf[i])){
/* hex2bin is also used for decimal digits, where it yields the digit value. */
4702 c += hex2bin(buf[i]);
4708 return nkf_char_unicode_new(c);
/* Push c back through the hook below numchar_getc. */
4718 numchar_ungetc(nkf_char c, FILE *f)
4720 return (*i_nungetc)(c, f);
4724 #ifdef UNICODE_NORMALIZATION
/*
 * Reader that replaces decomposed byte sequences by their composed form.
 * EOF, values above 0xFF and continuation bytes (top bits 10) are returned
 * unchanged.  Otherwise the byte starts a binary search over
 * normalization_table: the buffered bytes are compared with the nfd entry at
 * mid, reading more input as needed.  On a full match the nfc bytes are
 * pushed into the buffer.  At the end all buffered bytes but one are pushed
 * back to the stream and the remaining one is returned.
 * NOTE(review): several statements of the search loop are not visible here.
 */
4729 nkf_char (*g)(FILE *f) = i_nfc_getc;
4730 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4731 nkf_buf_t *buf = nkf_state->nfc_buf;
4732 const unsigned char *array;
4733 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4734 nkf_char c = (*g)(f);
4736 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4738 nkf_buf_push(buf, c);
4740 while (lower <= upper) {
4741 int mid = (lower+upper) / 2;
4743 array = normalization_table[mid].nfd;
4744 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4745 if (len >= nkf_buf_length(buf)) {
/* Setting lower above upper ends the search. */
4749 lower = 1, upper = 0;
4752 nkf_buf_push(buf, c);
4754 if (array[len] != nkf_buf_at(buf, len)) {
4755 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4756 else upper = mid - 1;
4763 array = normalization_table[mid].nfc;
4765 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4766 nkf_buf_push(buf, array[i]);
4770 } while (lower <= upper);
4772 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4773 c = nkf_buf_pop(buf);
/* Push c back through the hook below the normalizing reader. */
4779 nfc_ungetc(nkf_char c, FILE *f)
4781 return (*i_nfc_ungetc)(c, f);
4783 #endif /* UNICODE_NORMALIZATION */
/*
 * Map one Base64 character to its 6-bit value.
 * Character literals are used as numeric constants; the inline notes give the
 * intended numbers.  The hyphen is accepted as 62 like + and the underscore
 * as 63 like the slash.
 * NOTE(review): the range tests of some branches and the return statement
 * are not visible here.
 */
4787 base64decode(nkf_char c)
4792 i = c - 'A'; /* A..Z 0-25 */
4793 } else if (c == '_') {
4794 i = '?' /* 63 */ ; /* _ 63 */
4796 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4798 } else if (c > '/') {
4799 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4800 } else if (c == '+' || c == '-') {
4801 i = '>' /* 62 */ ; /* + and - 62 */
4803 i = '?' /* 63 */ ; /* / 63 */
/*
 * Decoding reader for MIME encoded words (the function header is not visible
 * here).  Order of work as far as the visible lines show:
 *  1. Bytes already decoded into the ring buffer are returned first.
 *  2. With decode mode 1 or FALSE the MIME hooks are removed and the read is
 *     passed to the underlying i_getc.
 *  3. Mode Q: underscore yields a space, ?= ends the word, = followed by a
 *     control byte is a soft line wrap, and = followed by two hex digits
 *     yields the decoded byte.
 *  4. Mode B: four input characters are decoded with base64decode() into up
 *     to three bytes, which are appended to the ring buffer; the first one is
 *     returned.
 * After the end of a word, following white space is collected in lwsp_buf and
 * pushed back unless another encoded word follows directly.
 * NOTE(review): many statements, including the padding handling of the
 * Base64 part, are not visible here.
 */
4811 nkf_char c1, c2, c3, c4, cc;
4812 nkf_char t1, t2, t3, t4, mode, exit_mode;
4813 nkf_char lwsp_count;
/* Initial capacity of the white space buffer; allocations add 5 bytes of slack. */
4816 nkf_char lwsp_size = 128;
4818 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4819 return mime_input_buf(mime_input_state.top++);
4821 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4822 mime_decode_mode=FALSE;
4823 unswitch_mime_getc();
4824 return (*i_getc)(f);
4827 if (mimebuf_f == FIXED_MIME)
4828 exit_mode = mime_decode_mode;
4831 if (mime_decode_mode == 'Q') {
4832 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4834 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4835 if (c1<=SP || DEL<=c1) {
4836 mime_decode_mode = exit_mode; /* prepare for quit */
4839 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4843 mime_decode_mode = exit_mode; /* prepare for quit */
4844 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4845 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4846 /* end Q encoding */
4847 input_mode = exit_mode;
4849 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4850 while ((c1=(*i_getc)(f))!=EOF) {
4855 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4863 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4864 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4879 lwsp_buf[lwsp_count] = (unsigned char)c1;
4880 if (lwsp_count++>lwsp_size){
4882 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4883 lwsp_buf = lwsp_buf_new;
/* Push the collected white space back unless it is followed by = and ends in a blank. */
4889 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4891 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4892 i_ungetc(lwsp_buf[lwsp_count],f);
4895 nkf_xfree(lwsp_buf);
4898 if (c1=='='&&c2<SP) { /* this is soft wrap */
4899 while((c1 = (*i_mgetc)(f)) <=SP) {
4900 if (c1 == EOF) return (EOF);
4902 mime_decode_mode = 'Q'; /* still in MIME */
4903 goto restart_mime_q;
4906 mime_decode_mode = 'Q'; /* still in MIME */
4910 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4911 if (c2<=SP) return c2;
4912 mime_decode_mode = 'Q'; /* still in MIME */
4913 return ((hex2bin(c2)<<4) + hex2bin(c3));
4916 if (mime_decode_mode != 'B') {
4917 mime_decode_mode = FALSE;
4918 return (*i_mgetc)(f);
4922 /* Base64 encoding */
4924 MIME allows line break in the middle of
4925 Base64, but we are very pessimistic in decoding
4926 in unbuf mode because MIME encoded code may broken by
4927 less or editor's control sequence (such as ESC-[-K in unbuffered
4928 mode. ignore incomplete MIME.
4930 mode = mime_decode_mode;
4931 mime_decode_mode = exit_mode; /* prepare for quit */
4933 while ((c1 = (*i_mgetc)(f))<=SP) {
4938 if ((c2 = (*i_mgetc)(f))<=SP) {
4941 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4942 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4945 if ((c1 == '?') && (c2 == '=')) {
4948 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4949 while ((c1=(*i_getc)(f))!=EOF) {
4954 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4962 if ((c1=(*i_getc)(f))!=EOF) {
4966 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4981 lwsp_buf[lwsp_count] = (unsigned char)c1;
4982 if (lwsp_count++>lwsp_size){
4984 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4985 lwsp_buf = lwsp_buf_new;
4991 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4993 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4994 i_ungetc(lwsp_buf[lwsp_count],f);
4997 nkf_xfree(lwsp_buf);
5001 if ((c3 = (*i_mgetc)(f))<=SP) {
5004 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5005 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5009 if ((c4 = (*i_mgetc)(f))<=SP) {
5012 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5013 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5017 mime_decode_mode = mode; /* still in MIME sigh... */
5019 /* BASE 64 decoding */
5021 t1 = 0x3f & base64decode(c1);
5022 t2 = 0x3f & base64decode(c2);
5023 t3 = 0x3f & base64decode(c3);
5024 t4 = 0x3f & base64decode(c4);
5025 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5027 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5028 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5030 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5031 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5033 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5038 return mime_input_buf(mime_input_state.top++);
/* Base64 alphabet, indexed by 6-bit value. */
5041 static const char basis_64[] =
5042 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Capacity of the MIME output look-ahead buffer (one extra byte is reserved in buf). */
5044 #define MIMEOUT_BUF_LENGTH 74
5046 unsigned char buf[MIMEOUT_BUF_LENGTH+1];
5050 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * Start an encoded word on output for the given output mode.
 * The header pattern is the mime_pattern entry whose mime_encode value equals
 * mode (entry 0 when none matches), and mimeout_mode becomes the matching
 * mime_encode_method letter.  When more than 45 columns are already used, a
 * line break is written first.  White space waiting in mimeout_state.buf is
 * written directly, and the remaining buffered bytes are re-fed through
 * mime_putc after the count has been reset to 0.
 * NOTE(review): the writing of the header itself and several loop bounds are
 * not visible here.
 */
5053 open_mime(nkf_char mode)
5055 const unsigned char *p;
5058 p = mime_pattern[0];
5059 for(i=0;mime_pattern[i];i++) {
5060 if (mode == mime_encode[i]) {
5061 p = mime_pattern[i];
5065 mimeout_mode = mime_encode_method[i];
5067 if (base64_count>45) {
5068 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
5069 (*o_mputc)(mimeout_state.buf[i]);
5072 put_newline(o_mputc);
5075 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
5079 for (;i<mimeout_state.count;i++) {
5080 if (nkf_isspace(mimeout_state.buf[i])) {
5081 (*o_mputc)(mimeout_state.buf[i]);
5091 j = mimeout_state.count;
5092 mimeout_state.count = 0;
5094 mime_putc(mimeout_state.buf[i]);
/*
 * Called before a character is encoded, to break the output line in time.
 * The projected width is base64_count plus four columns for every three
 * buffered bytes.  Inside an encoded word the line is broken above 73
 * columns, or above 66 columns before a non-ASCII character.  Outside a word
 * it is broken above 60 columns after a new encoded word has been opened.
 * A break consists of closing the word with (EOF,0), a newline and one space.
 * NOTE(review): the conditions that select among the three cases are only
 * partly visible here.
 */
5099 mime_prechar(nkf_char c2, nkf_char c1)
5101 if (mimeout_mode > 0){
5103 if (base64_count + mimeout_state.count/3*4> 73){
5104 (*o_base64conv)(EOF,0);
5105 oconv_newline(o_base64conv);
5106 (*o_base64conv)(0,SP);
5110 if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) {
5111 (*o_base64conv)(EOF,0);
5112 oconv_newline(o_base64conv);
5113 (*o_base64conv)(0,SP);
5119 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
5120 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5121 open_mime(output_mode);
5122 (*o_base64conv)(EOF,0);
5123 oconv_newline(o_base64conv);
5124 (*o_base64conv)(0,SP);
5143 switch(mimeout_mode) {
5148 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
5154 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
5159 if (mimeout_mode > 0) {
5160 if (mimeout_f!=FIXED_MIME) {
5162 } else if (mimeout_mode != 'Q')
/*
 * mimeout_addchar: encode one byte c according to mimeout_mode and write the
 * resulting characters through o_mputc.
 *
 * Visible behaviour:
 *  - in one branch a non-alphanumeric byte is written as two hex digits
 *    (high nibble, then low nibble) via bin2hex;
 *  - the remaining branches implement base64: nkf_state->mimeout_state keeps
 *    the previous byte so its leftover low bits can be combined with the high
 *    bits of the next byte when indexing basis_64.
 *
 * NOTE(review): the case labels are not visible in this excerpt.
 */
5168 mimeout_addchar(nkf_char c)
5170 switch(mimeout_mode) {
5175 } else if(!nkf_isalnum(c)) {
5177 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5178 (*o_mputc)(bin2hex((c&0xf)));
/* Base64, first byte of a group: emit its top 6 bits and remember the byte. */
5186 nkf_state->mimeout_state=c;
5187 (*o_mputc)(basis_64[c>>2]);
/* Second byte: low 2 bits of the previous byte joined with the high 4 bits of c. */
5192 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5193 nkf_state->mimeout_state=c;
/* Third byte: low 4 bits of the previous byte joined with the top 2 bits of c,
 * followed by the low 6 bits of c. */
5198 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
5199 (*o_mputc)(basis_64[c & 0x3F]);
/*
 * mime_putc: accept one output byte c (or EOF) for MIME output.
 *
 * The visible code has two parts:
 *  - mimeout_f == FIXED_MIME: a newline is written once base64_count passes
 *    71, with separate handling for c == EOF and c != EOF;
 *  - mimeout_f != FIXED_MIME: bytes are staged in mimeout_state.buf and later
 *    either written raw with o_mputc or encoded with mimeout_addchar,
 *    depending on mimeout_mode, output_mode and the byte values.
 *
 * NOTE(review): many lines of this function are missing from the excerpt;
 * the comments below describe only the statements that are visible.
 */
5211 mime_putc(nkf_char c)
5216 if (mimeout_f == FIXED_MIME){
5217 if (mimeout_mode == 'Q'){
5218 if (base64_count > 71){
5219 if (c!=CR && c!=LF) {
5221 put_newline(o_mputc);
5226 if (base64_count > 71){
5228 put_newline(o_mputc);
5231 if (c == EOF) { /* c==EOF */
5235 if (c != EOF) { /* c!=EOF */
5241 /* mimeout_f != FIXED_MIME */
5243 if (c == EOF) { /* c==EOF */
/* At EOF with more than one staged byte and mode -1, open_mime is called
 * before the staged bytes are processed. */
5244 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
5245 j = mimeout_state.count;
5246 mimeout_state.count = 0;
5248 if (mimeout_mode > 0) {
5249 if (!nkf_isblank(mimeout_state.buf[j-1])) {
5251 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
5254 mimeout_addchar(mimeout_state.buf[i]);
5258 mimeout_addchar(mimeout_state.buf[i]);
5262 mimeout_addchar(mimeout_state.buf[i]);
5268 mimeout_addchar(mimeout_state.buf[i]);
/* lastchar is the most recently staged byte, when there is one. */
5274 if (mimeout_state.count > 0){
5275 lastchar = mimeout_state.buf[mimeout_state.count - 1];
5280 if (mimeout_mode=='Q') {
5281 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5282 if (c == CR || c == LF) {
5287 } else if (c <= SP) {
5289 if (base64_count > 70) {
5290 put_newline(o_mputc);
5293 if (!nkf_isblank(c)) {
5298 if (base64_count > 70) {
5300 put_newline(o_mputc);
5303 open_mime(output_mode);
5305 if (!nkf_noescape_mime(c)) {
5318 if (mimeout_mode <= 0) {
5319 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5320 output_mode == UTF_8)) {
5321 if (nkf_isspace(c)) {
5323 if (mimeout_mode == -1) {
5326 if (c==CR || c==LF) {
5328 open_mime(output_mode);
/* Write the staged bytes out unencoded. */
5334 for (i=0;i<mimeout_state.count;i++) {
5335 (*o_mputc)(mimeout_state.buf[i]);
5336 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
5347 mimeout_state.buf[0] = (char)c;
5348 mimeout_state.count = 1;
/* Staged text would take the line past 76 columns: search the staged bytes
 * for the boundary= marker held in str. */
5350 if (base64_count > 1
5351 && base64_count + mimeout_state.count > 76
5352 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
5353 static const char *str = "boundary=\"";
5354 static int len = 10;
5357 for (; i < mimeout_state.count - len; ++i) {
5358 if (!strncmp((char *)(mimeout_state.buf+i), str, len)) {
5364 if (i == 0 || i == mimeout_state.count - len) {
5365 put_newline(o_mputc);
5367 if (!nkf_isspace(mimeout_state.buf[0])){
/* Emit staged bytes 0..i, break the line, then shift the rest of the buffer
 * down by i positions. */
5374 for (j = 0; j <= i; ++j) {
5375 (*o_mputc)(mimeout_state.buf[j]);
5377 put_newline(o_mputc);
/* NOTE(review): the bound is j <= count, so buf[count] is read as well;
 * confirm that index always lies inside buf. */
5379 for (; j <= mimeout_state.count; ++j) {
5380 mimeout_state.buf[j - i] = mimeout_state.buf[j];
5382 mimeout_state.count -= i;
5385 mimeout_state.buf[mimeout_state.count++] = (char)c;
/* More than MIMEOUT_BUF_LENGTH bytes staged: open_mime is called. */
5386 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5387 open_mime(output_mode);
5392 if (lastchar==CR || lastchar == LF){
5393 for (i=0;i<mimeout_state.count;i++) {
5394 (*o_mputc)(mimeout_state.buf[i]);
5397 mimeout_state.count = 0;
/* Write all staged bytes except the last, then restart the buffer with a
 * single SP. */
5400 for (i=0;i<mimeout_state.count-1;i++) {
5401 (*o_mputc)(mimeout_state.buf[i]);
5404 mimeout_state.buf[0] = SP;
5405 mimeout_state.count = 1;
5407 open_mime(output_mode);
5410 /* mimeout_mode == 'B', 1, 2 */
5411 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5412 output_mode == UTF_8)) {
5413 if (lastchar == CR || lastchar == LF){
/* After a line break, a blank c causes the staged bytes to be encoded with
 * mimeout_addchar; the other visible loop writes them raw. */
5414 if (nkf_isblank(c)) {
5415 for (i=0;i<mimeout_state.count;i++) {
5416 mimeout_addchar(mimeout_state.buf[i]);
5418 mimeout_state.count = 0;
5421 for (i=0;i<mimeout_state.count;i++) {
5422 (*o_mputc)(mimeout_state.buf[i]);
5425 mimeout_state.count = 0;
5427 mimeout_state.buf[mimeout_state.count++] = (char)c;
5430 if (nkf_isspace(c)) {
/* Scan the staged bytes for one strictly between SP and DEL. */
5431 for (i=0;i<mimeout_state.count;i++) {
5432 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
5434 for (i=0;i<mimeout_state.count;i++) {
5435 (*o_mputc)(mimeout_state.buf[i]);
5438 mimeout_state.count = 0;
5441 mimeout_state.buf[mimeout_state.count++] = (char)c;
5442 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5444 for (j=0;j<mimeout_state.count;j++) {
5445 (*o_mputc)(mimeout_state.buf[j]);
5448 mimeout_state.count = 0;
5452 if (mimeout_state.count>0 && SP<c && c!='=') {
5453 mimeout_state.buf[mimeout_state.count++] = (char)c;
5454 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5455 j = mimeout_state.count;
5456 mimeout_state.count = 0;
5458 mimeout_addchar(mimeout_state.buf[i]);
5465 if (mimeout_state.count>0) {
5466 j = mimeout_state.count;
5467 mimeout_state.count = 0;
5469 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5471 mimeout_addchar(mimeout_state.buf[i]);
5477 (*o_mputc)(mimeout_state.buf[i]);
5479 open_mime(output_mode);
/*
 * base64_conv: output-chain stage for MIME output. Calls mime_prechar for the
 * character (c2, c1) and then forwards the same pair to o_base64conv.
 */
5486 base64_conv(nkf_char c2, nkf_char c1)
5488 mime_prechar(c2, c1);
5489 (*o_base64conv)(c2,c1);
/*
 * State for an iconv-based converter: input and output buffers with their
 * sizes. NOTE(review): some members and the closing typedef name are not
 * visible in this excerpt.
 */
5493 typedef struct nkf_iconv_t {
5496 size_t input_buffer_size;
5497 char *output_buffer;
5498 size_t output_buffer_size;
/*
 * nkf_iconv_new: set up a converter from encoding fromcode to encoding tocode.
 *
 * Allocates an input buffer of IOBUF_SIZE bytes and an output buffer of twice
 * that size with nkf_xmalloc, then opens the conversion descriptor with
 * iconv_open and reports a failure through perror.
 *
 * NOTE(review): converter is declared as a plain nkf_iconv_t yet accessed
 * with ->; the typedef name is not visible here, so confirm it is a pointer
 * type and that storage exists before these writes.
 */
5502 nkf_iconv_new(char *tocode, char *fromcode)
5504 nkf_iconv_t converter;
5506 converter->input_buffer_size = IOBUF_SIZE;
5507 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5508 converter->output_buffer_size = IOBUF_SIZE * 2;
5509 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5510 converter->cd = iconv_open(tocode, fromcode);
/* iconv_open signals failure by returning (iconv_t)-1. */
5511 if (converter->cd == (iconv_t)-1)
/* NOTE(review): perror takes a const char *, while fprintf returns int and
 * needs a FILE * as its first argument; this call is not valid as written. */
5515 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5518 perror("can't iconv_open");
/*
 * nkf_iconv_convert: read bytes with i_getc into the converter input buffer,
 * run iconv on them, and write the converted bytes with o_putc.
 *
 * On an iconv error the visible code moves unconsumed input to the start of
 * the input buffer, or doubles the output buffer with realloc, or reports the
 * error with perror.
 *
 * NOTE(review): lines are missing from this excerpt (the errno dispatch in
 * particular), and several visible statements look inconsistent; see the
 * notes below.
 */
5524 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5526 size_t invalid = (size_t)0;
5527 char *input_buffer = converter->input_buffer;
5528 size_t input_length = (size_t)0;
5529 char *output_buffer = converter->output_buffer;
5530 size_t output_length = converter->output_buffer_size;
/* NOTE(review): the stream parameter is named input, but the read uses f. */
5535 while ((c = (*i_getc)(f)) != EOF) {
5536 input_buffer[input_length++] = c;
/* NOTE(review): this leaves the loop while the buffer still has room, i.e.
 * after the first byte; the comparison looks inverted - confirm. */
5537 if (input_length < converter->input_buffer_size) break;
5541 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
/* NOTE(review): iconv advances output_buffer and leaves the unused space in
 * output_length, so this loop and its index need checking against the iconv
 * specification. */
5542 while (output_length-- > 0) {
5543 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
/* iconv returns (size_t)-1 on error. */
5545 if (ret == (size_t) - 1) {
/* Keep the bytes iconv did not consume at the front of the input buffer. */
5548 if (input_buffer != converter->input_buffer)
5549 memmove(converter->input_buffer, input_buffer, input_length);
/* Grow the output buffer to twice its size.
 * NOTE(review): the struct declared in this file names the field
 * output_buffer, not outbuf. */
5552 converter->output_buffer_size *= 2;
5553 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5554 if (output_buffer == NULL) {
5555 perror("can't realloc");
5558 converter->output_buffer = output_buffer;
5561 perror("can't iconv");
/*
 * nkf_iconv_close: release the two buffers with nkf_xfree and close the
 * conversion descriptor with iconv_close.
 *
 * NOTE(review): the parameter is named convert while the body uses converter,
 * and the fields are spelled inbuf/outbuf here but input_buffer/output_buffer
 * where the buffers are allocated; confirm which names are intended.
 */
5574 nkf_iconv_close(nkf_iconv_t *convert)
5576 nkf_xfree(converter->inbuf);
5577 nkf_xfree(converter->outbuf);
5578 iconv_close(converter->cd);
5587 struct input_code *p = input_code_list;
5599 mime_f = MIME_DECODE_DEFAULT;
5600 mime_decode_f = FALSE;
5605 x0201_f = NKF_UNSPECIFIED;
5606 iso2022jp_f = FALSE;
5607 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5608 ms_ucs_map_f = UCS_MAP_ASCII;
5610 #ifdef UTF8_INPUT_ENABLE
5611 no_cp932ext_f = FALSE;
5612 no_best_fit_chars_f = FALSE;
5613 encode_fallback = NULL;
5614 unicode_subchar = '?';
5615 input_endian = ENDIAN_BIG;
5617 #ifdef UTF8_OUTPUT_ENABLE
5618 output_bom_f = FALSE;
5619 output_endian = ENDIAN_BIG;
5621 #ifdef UNICODE_NORMALIZATION
5637 #ifdef SHIFTJIS_CP932
5647 for (i = 0; i < 256; i++){
5648 prefix_table[i] = 0;
5652 mimeout_state.count = 0;
5657 fold_preserve_f = FALSE;
5660 kanji_intro = DEFAULT_J;
5661 ascii_intro = DEFAULT_R;
5662 fold_margin = FOLD_MARGIN;
5663 o_zconv = no_connection;
5664 o_fconv = no_connection;
5665 o_eol_conv = no_connection;
5666 o_rot_conv = no_connection;
5667 o_hira_conv = no_connection;
5668 o_base64conv = no_connection;
5669 o_iso2022jp_check_conv = no_connection;
5672 i_ungetc = std_ungetc;
5674 i_bungetc = std_ungetc;
5677 i_mungetc = std_ungetc;
5678 i_mgetc_buf = std_getc;
5679 i_mungetc_buf = std_ungetc;
5680 output_mode = ASCII;
5682 mime_decode_mode = FALSE;
5688 z_prev2=0,z_prev1=0;
5690 iconv_for_check = 0;
5692 input_codename = NULL;
5693 input_encoding = NULL;
5694 output_encoding = NULL;
/*
 * module_connection: wire up the conversion pipeline from the current option
 * flags.
 *
 * Output side: resolves output_encoding (falling back to
 * nkf_default_encoding, and to ISO_2022_JP when noout_f or guess_f is set),
 * takes the base oconv from nkf_enc_to_oconv, and then inserts each enabled
 * filter by saving the current oconv in that filter's o_* pointer and
 * installing the filter as the new oconv.
 *
 * Input side: performs the same save-and-replace on i_getc / i_ungetc for
 * each enabled input filter, then selects the input converter with set_iconv.
 *
 * NOTE(review): the conditions guarding several filters are not visible in
 * this excerpt. kanji_convert tests the return value with < 0, but no return
 * statement is visible here.
 */
5702 module_connection(void)
5704 if (input_encoding) set_input_encoding(input_encoding);
5705 if (!output_encoding) {
5706 output_encoding = nkf_default_encoding();
5708 if (!output_encoding) {
5709 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5712 set_output_encoding(output_encoding);
5713 oconv = nkf_enc_to_oconv(output_encoding);
5715 if (nkf_enc_unicode_p(output_encoding))
5716 output_mode = UTF_8;
/* Apply the default only when x0201_f still holds NKF_UNSPECIFIED. */
5718 if (x0201_f == NKF_UNSPECIFIED) {
5719 x0201_f = X0201_DEFAULT;
5722 /* replace continuation module, from output side */
5724 /* output redirection */
5726 if (noout_f || guess_f){
5733 if (mimeout_f == TRUE) {
5734 o_base64conv = oconv; oconv = base64_conv;
5736 /* base64_count = 0; */
5739 if (eolmode_f || guess_f) {
5740 o_eol_conv = oconv; oconv = eol_conv;
5743 o_rot_conv = oconv; oconv = rot_conv;
5746 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5749 o_hira_conv = oconv; oconv = hira_conv;
5752 o_fconv = oconv; oconv = fold_conv;
5755 if (alpha_f || x0201_f) {
5756 o_zconv = oconv; oconv = z_conv;
5760 i_ungetc = std_ungetc;
5761 /* input redirection */
5764 i_cgetc = i_getc; i_getc = cap_getc;
5765 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5768 i_ugetc = i_getc; i_getc = url_getc;
5769 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5772 #ifdef NUMCHAR_OPTION
5774 i_ngetc = i_getc; i_getc = numchar_getc;
5775 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5778 #ifdef UNICODE_NORMALIZATION
5780 i_nfc_getc = i_getc; i_getc = nfc_getc;
5781 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5784 if (mime_f && mimebuf_f==FIXED_MIME) {
5785 i_mgetc = i_getc; i_getc = mime_getc;
5786 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5789 i_bgetc = i_getc; i_getc = broken_getc;
5790 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
/* An explicit input encoding selects its own converter; the other visible
 * call installs e_iconv with FALSE as the first argument. */
5792 if (input_encoding) {
5793 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5795 set_iconv(FALSE, e_iconv);
5799 struct input_code *p = input_code_list;
5808 Conversion main loop. Code detection only.
5811 #if !defined(PERL_XS) && !defined(WIN32DLL)
5818 module_connection();
5819 while ((c = (*i_getc)(f)) != EOF)
/*
 * Loop-control shorthands and the set_input_mode helper macro. NEXT, SKIP and
 * MORE expand to continue, LAST expands to break, and SEND expands to a no-op.
 * NOTE(review): part of the set_input_mode body is not visible in this excerpt.
 */
5826 #define NEXT continue /* no output, get next */
5827 #define SKIP c2=0;continue /* no output, get next */
5828 #define MORE c2=c1;continue /* need one more byte */
5829 #define SEND (void)0 /* output c1 and c2, get next */
5830 #define LAST break /* end of loop, go closing */
5831 #define set_input_mode(mode) do { \
5832 input_mode = mode; \
5834 set_input_codename("ISO-2022-JP"); \
5835 debug("ISO-2022-JP"); \
/*
 * kanji_convert: main conversion loop. Reads the stream f through i_getc,
 * recognises the input encoding and escape sequences, and passes decoded
 * characters to iconv / oconv.
 *
 * Visible structure:
 *  - module_connection is called first; a negative result is reported as
 *    "no output encoding given";
 *  - when iconv is w_iconv32 or w_iconv16 the input is consumed in fixed
 *    groups of four or two bytes;
 *  - otherwise bytes are handled one at a time, covering 8-bit bytes, MIME
 *    start sequences, SI/SO, ESC designations and line ends;
 *  - after the loop, (*iconv)(EOF, 0, 0) is called and, when no input
 *    codename has been set, the input_code_list entry with the lowest score
 *    is chosen.
 *
 * NOTE(review): many lines of this function are missing from the excerpt;
 * comments describe only what is visible.
 */
5839 kanji_convert(FILE *f)
5841 nkf_char c1=0, c2=0, c3=0, c4=0;
5842 int shift_mode = 0; /* 0, 1, 2, 3 */
5844 int is_8bit = FALSE;
5846 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5851 output_mode = ASCII;
5853 if (module_connection() < 0) {
5854 #if !defined(PERL_XS) && !defined(WIN32DLL)
5855 fprintf(stderr, "no output encoding given\n");
5861 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: four bytes are read per iteration; a further four are read
 * when the converter reports NKF_ICONV_WAIT_COMBINING_CHAR. */
5862 if(iconv == w_iconv32){
5863 while ((c1 = (*i_getc)(f)) != EOF &&
5864 (c2 = (*i_getc)(f)) != EOF &&
5865 (c3 = (*i_getc)(f)) != EOF &&
5866 (c4 = (*i_getc)(f)) != EOF) {
5867 nkf_char c5, c6, c7, c8;
5868 if (nkf_iconv_utf_32(c1, c2, c3, c4) == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5869 if ((c5 = (*i_getc)(f)) != EOF &&
5870 (c6 = (*i_getc)(f)) != EOF &&
5871 (c7 = (*i_getc)(f)) != EOF &&
5872 (c8 = (*i_getc)(f)) != EOF) {
5873 if (nkf_iconv_utf_32_combine(c1, c2, c3, c4, c5, c6, c7, c8)) {
5878 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
5881 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
/* UTF-16 input: two bytes are read per iteration; two more are read when the
 * converter asks for them or reports a possible combining character. */
5887 else if (iconv == w_iconv16) {
5888 while ((c1 = (*i_getc)(f)) != EOF &&
5889 (c2 = (*i_getc)(f)) != EOF) {
5890 size_t ret = nkf_iconv_utf_16(c1, c2, 0, 0);
5891 if (ret == NKF_ICONV_NEED_TWO_MORE_BYTES &&
5892 (c3 = (*i_getc)(f)) != EOF &&
5893 (c4 = (*i_getc)(f)) != EOF) {
5894 nkf_iconv_utf_16(c1, c2, c3, c4);
5895 } else if (ret == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5896 if ((c3 = (*i_getc)(f)) != EOF &&
5897 (c4 = (*i_getc)(f)) != EOF) {
5898 if (nkf_iconv_utf_16_combine(c1, c2, c3, c4)) {
5901 nkf_iconv_utf_16_nocombine(c1, c2);
5904 nkf_iconv_utf_16_nocombine(c1, c2);
/* Remaining encodings: one byte is read per iteration. */
5912 while ((c1 = (*i_getc)(f)) != EOF) {
5913 #ifdef INPUT_CODE_FIX
5914 if (!input_encoding)
5919 if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
5920 /* in case of 8th bit is on */
5921 if (!estab_f&&!mime_decode_mode) {
5922 /* in case of not established yet */
5923 /* It is still ambiguous */
5924 if (h_conv(f, c2, c1)==EOF) {
5932 /* in case of already established */
5934 /* ignore bogus code */
5942 /* 2nd byte of 7 bit code or SJIS */
5946 else if (nkf_char_unicode_p(c1)) {
5952 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5955 }else if (input_codename && input_codename[0] == 'I' &&
5956 0xA1 <= c1 && c1 <= 0xDF) {
5957 /* JIS X 0201 Katakana in 8bit JIS */
5958 c2 = JIS_X_0201_1976_K;
5961 } else if (c1 > DEL) {
5963 if (!estab_f && !iso8859_f) {
5964 /* not established yet */
5966 } else { /* estab_f==TRUE */
5972 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5973 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5975 c2 = JIS_X_0201_1976_K;
5980 /* already established */
5984 } else if (SP < c1 && c1 < DEL) {
5985 /* in case of Roman characters */
5987 /* output 1 shifted byte */
5991 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5992 /* output 1 shifted byte */
5993 c2 = JIS_X_0201_1976_K;
5996 /* look like bogus code */
5999 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
6000 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
6001 /* in case of Kanji shifted */
6003 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
6004 /* Check MIME code */
6005 if ((c1 = (*i_getc)(f)) == EOF) {
6008 } else if (c1 == '?') {
6009 /* =? is mime conversion start sequence */
6010 if(mime_f == STRICT_MIME) {
6011 /* check in real detail */
6012 if (mime_begin_strict(f) == EOF)
6015 } else if (mime_begin(f) == EOF)
6024 /* normal ASCII code */
6027 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
6030 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
6033 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
6034 if ((c1 = (*i_getc)(f)) == EOF) {
6038 else if (c1 == '&') {
6040 if ((c1 = (*i_getc)(f)) == EOF) {
6046 else if (c1 == '$') {
6048 if ((c1 = (*i_getc)(f)) == EOF) {
6049 /* don't send bogus code
6051 (*oconv)(0, '$'); */
6053 } else if (c1 == '@' || c1 == 'B') {
6055 set_input_mode(JIS_X_0208);
6057 } else if (c1 == '(') {
6059 if ((c1 = (*i_getc)(f)) == EOF) {
6060 /* don't send bogus code
6066 } else if (c1 == '@'|| c1 == 'B') {
6068 set_input_mode(JIS_X_0208);
6071 } else if (c1 == 'D'){
6072 set_input_mode(JIS_X_0212);
6074 #endif /* X0212_ENABLE */
6075 } else if (c1 == 'O' || c1 == 'Q'){
6076 set_input_mode(JIS_X_0213_1);
6078 } else if (c1 == 'P'){
6079 set_input_mode(JIS_X_0213_2);
6082 /* could be some special code */
6089 } else if (broken_f&0x2) {
6090 /* accept any ESC-(-x as broken code ... */
6091 input_mode = JIS_X_0208;
6100 } else if (c1 == '(') {
6102 if ((c1 = (*i_getc)(f)) == EOF) {
6103 /* don't send bogus code
6105 (*oconv)(0, '('); */
6108 else if (c1 == 'I') {
6109 /* JIS X 0201 Katakana */
6110 set_input_mode(JIS_X_0201_1976_K);
6114 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
6115 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
6116 set_input_mode(ASCII);
6119 else if (broken_f&0x2) {
6120 set_input_mode(ASCII);
6129 else if (c1 == '.') {
6131 if ((c1 = (*i_getc)(f)) == EOF) {
6134 else if (c1 == 'A') {
6145 else if (c1 == 'N') {
6148 if (g2 == ISO_8859_1) {
6164 } else if (c1 == ESC && iconv == s_iconv) {
6165 /* ESC in Shift_JIS */
6166 if ((c1 = (*i_getc)(f)) == EOF) {
6169 } else if (c1 == '$') {
6171 if ((c1 = (*i_getc)(f)) == EOF) {
6173 } else if (('E' <= c1 && c1 <= 'G') ||
6174 ('O' <= c1 && c1 <= 'Q')) {
/* The table entry selected by c1 % 7 becomes the base value c3; each
 * following byte in SP..z is output as c1 + c3. */
6182 static const nkf_char jphone_emoji_first_table[7] =
6183 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
6184 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
6185 if ((c1 = (*i_getc)(f)) == EOF) LAST;
6186 while (SP <= c1 && c1 <= 'z') {
6187 (*oconv)(0, c1 + c3);
6188 if ((c1 = (*i_getc)(f)) == EOF) LAST;
6204 } else if (c1 == LF || c1 == CR) {
6206 input_mode = ASCII; set_iconv(FALSE, 0);
6208 } else if (mime_decode_f && !mime_decode_mode){
6210 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
6218 } else { /* if (c1 == CR)*/
6219 if ((c1=(*i_getc)(f))!=EOF) {
6223 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
6243 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
6246 if ((c3 = (*i_getc)(f)) != EOF) {
6249 if ((c4 = (*i_getc)(f)) != EOF) {
6251 (*iconv)(c2, c1, c3|c4);
6256 /* 4 bytes UTF-8 (check combining character) */
6257 if ((c3 = (*i_getc)(f)) != EOF) {
6258 if ((c4 = (*i_getc)(f)) != EOF) {
6259 if (w_iconv_combine(c2, c1, 0, c3, c4, 0)) {
6262 w_iconv_nocombine(c2, c1, 0);
6266 w_iconv_nocombine(c2, c1, 0);
6269 w_iconv_nocombine(c2, c1, 0);
6273 /* 3 bytes EUC or UTF-8 */
6274 if ((c3 = (*i_getc)(f)) != EOF) {
6276 if ((*iconv)(c2, c1, c3) == -3) {
6277 /* 6 bytes UTF-8 (check combining character) */
6279 if ((c4 = (*i_getc)(f)) != EOF) {
6280 if ((c5 = (*i_getc)(f)) != EOF) {
6281 if ((c6 = (*i_getc)(f)) != EOF) {
6282 if (w_iconv_combine(c2, c1, c3, c4, c5, c6)) {
6286 w_iconv_nocombine(c2, c1, c3);
6291 w_iconv_nocombine(c2, c1, c3);
6295 w_iconv_nocombine(c2, c1, c3);
6298 w_iconv_nocombine(c2, c1, c3);
6308 0x7F <= c2 && c2 <= 0x92 &&
6309 0x21 <= c1 && c1 <= 0x7E) {
6311 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
6314 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
6318 (*oconv)(PREFIX_EUCG3 | c2, c1);
6320 #endif /* X0212_ENABLE */
6322 (*oconv)(PREFIX_EUCG3 | c2, c1);
6325 (*oconv)(input_mode, c1); /* other special case */
6331 /* goto next_word */
/* End of input is passed to the input converter as EOF. */
6336 (*iconv)(EOF, 0, 0);
/* With no codename set, walk input_code_list and keep the entry with the
 * lowest score. */
6337 if (!input_codename)
6340 struct input_code *p = input_code_list;
6341 struct input_code *result = p;
6343 if (p->score < result->score) result = p;
6346 set_input_codename(result->name);
6348 debug(result->name);
6356 * int options(unsigned char *cp)
/*
 * options: parse one option string cp and set the corresponding global flags
 * and encodings.
 *
 * Long options (after a second dash) are looked up in the long_option table;
 * an entry with a non-empty alias redirects parsing to that alias string,
 * otherwise the option is matched by name with strcmp. Single-letter options
 * are handled by the case labels further down, several of which consume
 * following characters as arguments.
 *
 * NOTE(review): many lines of this function are missing from the excerpt;
 * comments describe only what is visible.
 */
6363 options(unsigned char *cp)
6367 unsigned char *cp_back = NULL;
/* Advance cp to just past the first dash (or to the end of the string). */
6372 while(*cp && *cp++!='-');
6373 while (*cp || cp_back) {
6381 case '-': /* literal options */
6382 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/* Compare cp with each long_option name, stopping at a = in the name. */
6386 for (i=0;i<(int)(sizeof(long_option)/sizeof(long_option[0]));i++) {
6387 p = (unsigned char *)long_option[i].name;
6388 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
6389 if (*p == cp[j] || cp[j] == SP){
6396 #if !defined(PERL_XS) && !defined(WIN32DLL)
6397 fprintf(stderr, "unknown long option: --%s\n", cp);
/* Skip the rest of this word. */
6401 while(*cp && *cp != SP && cp++);
6402 if (long_option[i].alias[0]){
6404 cp = (unsigned char *)long_option[i].alias;
6407 if (strcmp(long_option[i].name, "help") == 0){
6412 if (strcmp(long_option[i].name, "ic=") == 0){
6413 enc = nkf_enc_find((char *)p);
6415 input_encoding = enc;
6418 if (strcmp(long_option[i].name, "oc=") == 0){
6419 enc = nkf_enc_find((char *)p);
6420 /* if (enc <= 0) continue; */
6422 output_encoding = enc;
6425 if (strcmp(long_option[i].name, "guess=") == 0){
6426 if (p[0] == '0' || p[0] == '1') {
6434 if (strcmp(long_option[i].name, "overwrite") == 0){
6437 preserve_time_f = TRUE;
6440 if (strcmp(long_option[i].name, "overwrite=") == 0){
6443 preserve_time_f = TRUE;
6445 backup_suffix = (char *)p;
6448 if (strcmp(long_option[i].name, "in-place") == 0){
6451 preserve_time_f = FALSE;
6454 if (strcmp(long_option[i].name, "in-place=") == 0){
6457 preserve_time_f = FALSE;
6459 backup_suffix = (char *)p;
6464 if (strcmp(long_option[i].name, "cap-input") == 0){
6468 if (strcmp(long_option[i].name, "url-input") == 0){
6473 #ifdef NUMCHAR_OPTION
6474 if (strcmp(long_option[i].name, "numchar-input") == 0){
6480 if (strcmp(long_option[i].name, "no-output") == 0){
6484 if (strcmp(long_option[i].name, "debug") == 0){
6489 if (strcmp(long_option[i].name, "cp932") == 0){
6490 #ifdef SHIFTJIS_CP932
6494 #ifdef UTF8_OUTPUT_ENABLE
6495 ms_ucs_map_f = UCS_MAP_CP932;
6499 if (strcmp(long_option[i].name, "no-cp932") == 0){
6500 #ifdef SHIFTJIS_CP932
6504 #ifdef UTF8_OUTPUT_ENABLE
6505 ms_ucs_map_f = UCS_MAP_ASCII;
6509 #ifdef SHIFTJIS_CP932
6510 if (strcmp(long_option[i].name, "cp932inv") == 0){
6517 if (strcmp(long_option[i].name, "x0212") == 0){
6524 if (strcmp(long_option[i].name, "exec-in") == 0){
6528 if (strcmp(long_option[i].name, "exec-out") == 0){
6533 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
6534 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
6535 no_cp932ext_f = TRUE;
6538 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
6539 no_best_fit_chars_f = TRUE;
6542 if (strcmp(long_option[i].name, "fb-skip") == 0){
6543 encode_fallback = NULL;
6546 if (strcmp(long_option[i].name, "fb-html") == 0){
6547 encode_fallback = encode_fallback_html;
6550 if (strcmp(long_option[i].name, "fb-xml") == 0){
6551 encode_fallback = encode_fallback_xml;
6554 if (strcmp(long_option[i].name, "fb-java") == 0){
6555 encode_fallback = encode_fallback_java;
6558 if (strcmp(long_option[i].name, "fb-perl") == 0){
6559 encode_fallback = encode_fallback_perl;
6562 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6563 encode_fallback = encode_fallback_subchar;
/* fb-subchar= takes a number in decimal, hexadecimal or octal notation; the
 * parsed value is passed through w16e_conv and repacked as i<<8 | j.
 * NOTE(review): the digit loops reuse i, which also indexes long_option in
 * the enclosing loop - confirm control leaves that loop afterwards. */
6566 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6567 encode_fallback = encode_fallback_subchar;
6568 unicode_subchar = 0;
6570 /* decimal number */
6571 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6572 unicode_subchar *= 10;
6573 unicode_subchar += hex2bin(p[i]);
6575 }else if(p[1] == 'x' || p[1] == 'X'){
6576 /* hexadecimal number */
6577 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6578 unicode_subchar <<= 4;
6579 unicode_subchar |= hex2bin(p[i]);
6583 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6584 unicode_subchar *= 8;
6585 unicode_subchar += hex2bin(p[i]);
6588 w16e_conv(unicode_subchar, &i, &j);
6589 unicode_subchar = i<<8 | j;
6593 #ifdef UTF8_OUTPUT_ENABLE
6594 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6595 ms_ucs_map_f = UCS_MAP_MS;
6599 #ifdef UNICODE_NORMALIZATION
6600 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/* prefix=: the first character of the argument is stored in prefix_table
 * under each of the following printable characters. */
6605 if (strcmp(long_option[i].name, "prefix=") == 0){
6606 if (nkf_isgraph(p[0])){
6607 for (i = 1; nkf_isgraph(p[i]); i++){
6608 prefix_table[p[i]] = p[0];
6613 #if !defined(PERL_XS) && !defined(WIN32DLL)
6614 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6619 case 'b': /* buffered mode */
6622 case 'u': /* non-buffered mode */
6625 case 't': /* transparent mode */
6630 } else if (*cp=='2') {
6634 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6642 case 'j': /* JIS output */
6644 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6646 case 'e': /* AT&T EUC output */
6647 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6649 case 's': /* SJIS output */
6650 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6652 case 'l': /* ISO8859 Latin-1 support, no conversion */
6653 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6654 input_encoding = nkf_enc_from_index(ISO_8859_1);
6656 case 'i': /* Kanji IN ESC-$-@/B */
6657 if (*cp=='@'||*cp=='B')
6658 kanji_intro = *cp++;
6660 case 'o': /* ASCII IN ESC-(-J/B/H */
6661 /* ESC ( H was used in initial JUNET messages */
6662 if (*cp=='J'||*cp=='B'||*cp=='H')
6663 ascii_intro = *cp++;
6667 bit:1 katakana->hiragana
6668 bit:2 hiragana->katakana
6670 if ('9'>= *cp && *cp>='0')
6671 hira_f |= (*cp++ -'0');
6678 #if defined(MSDOS) || defined(__OS2__)
6685 show_configuration();
6693 #ifdef UTF8_OUTPUT_ENABLE
6694 case 'w': /* UTF-{8,16,32} output */
6699 output_encoding = nkf_enc_from_index(UTF_8N);
6701 output_bom_f = TRUE;
6702 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6706 if ('1'== cp[0] && '6'==cp[1]) {
6709 } else if ('3'== cp[0] && '2'==cp[1]) {
6713 output_encoding = nkf_enc_from_index(UTF_8);
6718 output_endian = ENDIAN_LITTLE;
6719 output_bom_f = TRUE;
6720 } else if (cp[0] == 'B') {
6722 output_bom_f = TRUE;
6725 output_bom_f = FALSE;
/* Map the generic UTF_16 / UTF_32 index to the endian-specific variant,
 * without and then with a byte order mark. */
6727 enc_idx = enc_idx == UTF_16
6728 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6729 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6731 enc_idx = enc_idx == UTF_16
6732 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6733 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6735 output_encoding = nkf_enc_from_index(enc_idx);
6739 #ifdef UTF8_INPUT_ENABLE
6740 case 'W': /* UTF input */
6743 input_encoding = nkf_enc_from_index(UTF_8);
6746 if ('1'== cp[0] && '6'==cp[1]) {
6748 input_endian = ENDIAN_BIG;
6750 } else if ('3'== cp[0] && '2'==cp[1]) {
6752 input_endian = ENDIAN_BIG;
6755 input_encoding = nkf_enc_from_index(UTF_8);
6760 input_endian = ENDIAN_LITTLE;
6761 } else if (cp[0] == 'B') {
6763 input_endian = ENDIAN_BIG;
6765 enc_idx = (enc_idx == UTF_16
6766 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6767 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6768 input_encoding = nkf_enc_from_index(enc_idx);
6772 /* Input code assumption */
6773 case 'J': /* ISO-2022-JP input */
6774 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6776 case 'E': /* EUC-JP input */
6777 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6779 case 'S': /* Shift_JIS input */
6780 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6782 case 'Z': /* Convert X0208 alphabet to ASCII */
6784 bit:0 Convert JIS X 0208 Alphabet to ASCII
6785 bit:1 Convert Kankaku to one space
6786 bit:2 Convert Kankaku to two spaces
6787 bit:3 Convert HTML Entity
6788 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6790 while ('0'<= *cp && *cp <='4') {
6791 alpha_f |= 1 << (*cp++ - '0');
6795 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6796 x0201_f = FALSE; /* No X0201->X0208 conversion */
6798 ESC-(-I in JIS, EUC, MS Kanji
6799 SI/SO in JIS, EUC, MS Kanji
6800 SS2 in EUC, JIS, not in MS Kanji
6801 MS Kanji (0xa0-0xdf)
6803 ESC-(-I in JIS (0x20-0x5f)
6804 SS2 in EUC (0xa0-0xdf)
6805 0xa0-0xd in MS Kanji (0xa0-0xdf)
6808 case 'X': /* Convert X0201 kana to X0208 */
6811 case 'F': /* preserve new lines */
6812 fold_preserve_f = TRUE;
/* fall through: F sets the preserve flag and then shares the f handling */
6813 case 'f': /* folding -f60 or -f */
6816 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6818 fold_len += *cp++ - '0';
/* Values outside 1..BUFSIZ-1 fall back to DEFAULT_FOLD. */
6820 if (!(0<fold_len && fold_len<BUFSIZ))
6821 fold_len = DEFAULT_FOLD;
6825 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6827 fold_margin += *cp++ - '0';
6831 case 'm': /* MIME support */
6832 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6833 if (*cp=='B'||*cp=='Q') {
6834 mime_decode_mode = *cp++;
6835 mimebuf_f = FIXED_MIME;
6836 } else if (*cp=='N') {
6837 mime_f = TRUE; cp++;
6838 } else if (*cp=='S') {
6839 mime_f = STRICT_MIME; cp++;
6840 } else if (*cp=='0') {
6841 mime_decode_f = FALSE;
6842 mime_f = FALSE; cp++;
6844 mime_f = STRICT_MIME;
6847 case 'M': /* MIME output */
6850 mimeout_f = FIXED_MIME; cp++;
6851 } else if (*cp=='Q') {
6853 mimeout_f = FIXED_MIME; cp++;
6858 case 'B': /* Broken JIS support */
6860 bit:1 allow any x on ESC-(-x or ESC-$-x
6861 bit:2 reset to ascii on NL
6863 if ('9'>= *cp && *cp>='0')
6864 broken_f |= 1<<(*cp++ -'0');
6869 case 'O':/* for Output file */
6873 case 'c':/* add cr code */
6876 case 'd':/* delete cr code */
6879 case 'I': /* ISO-2022-JP output */
6882 case 'L': /* line mode */
6883 if (*cp=='u') { /* unix */
6884 eolmode_f = LF; cp++;
6885 } else if (*cp=='m') { /* mac */
6886 eolmode_f = CR; cp++;
6887 } else if (*cp=='w') { /* windows */
6888 eolmode_f = CRLF; cp++;
6889 } else if (*cp=='0') { /* no conversion */
6890 eolmode_f = 0; cp++;
6895 if ('2' <= *cp && *cp <= '9') {
6898 } else if (*cp == '0' || *cp == '1') {
6907 /* module multiple options in a string are allowed for Perl module */
6908 while(*cp && *cp++!='-');
6911 #if !defined(PERL_XS) && !defined(WIN32DLL)
6912 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6914 /* bogus option but ignored */
6922 #include "nkf32dll.c"
6923 #elif defined(PERL_XS)
6924 #else /* WIN32DLL */
/*
 * main: command-line entry point.
 *
 * Leading arguments that start with a dash are taken as option strings. The
 * program then converts stdin with kanji_convert, or opens each remaining
 * argument with fopen and processes it. When output goes to a file, a
 * temporary name is built from the original file name, stdout is redirected
 * to it, and afterwards the original permissions (and, when preserve_time_f
 * is set, timestamps) are copied onto it before it is renamed over the
 * original, optionally keeping a backup.
 *
 * NOTE(review): many lines of this function are missing from the excerpt;
 * comments describe only what is visible.
 */
6926 main(int argc, char **argv)
6931 char *outfname = NULL;
6934 #ifdef EASYWIN /*Easy Win */
6935 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6937 #ifdef DEFAULT_CODE_LOCALE
6938 setlocale(LC_CTYPE, "");
/* Consume the leading arguments that begin with a dash. */
6942 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6943 cp = (unsigned char *)*argv;
6948 if (pipe(fds) < 0 || (pid = fork()) < 0){
6959 execvp(argv[1], &argv[1]);
/* Several flags are saved here and restored below. */
6976 int debug_f_back = debug_f;
6979 int exec_f_back = exec_f;
6982 int x0212_f_back = x0212_f;
6984 int x0213_f_back = x0213_f;
6985 int guess_f_back = guess_f;
6987 guess_f = guess_f_back;
6990 debug_f = debug_f_back;
6993 exec_f = exec_f_back;
6995 x0212_f = x0212_f_back;
6996 x0213_f = x0213_f_back;
6999 if (binmode_f == TRUE)
7000 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7001 if (freopen("","wb",stdout) == NULL)
7008 setbuf(stdout, (char *) NULL);
7010 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
7013 if (binmode_f == TRUE)
7014 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7015 if (freopen("","rb",stdin) == NULL) return (-1);
7019 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
/* Convert standard input. */
7023 kanji_convert(stdin);
7024 if (guess_f) print_guessed_code(NULL);
7028 int is_argument_error = FALSE;
7030 input_codename = NULL;
7033 iconv_for_check = 0;
/* Open the next named file; a failure is recorded in is_argument_error. */
7035 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
7037 is_argument_error = TRUE;
7045 /* reopen file for stdout */
7046 if (file_out_f == TRUE) {
/* Build the temporary output name from the original file name. */
7049 outfname = nkf_xmalloc(strlen(origfname)
7050 + strlen(".nkftmpXXXXXX")
7052 strcpy(outfname, origfname);
/* Scan backwards for the last path separator. */
7056 for (i = strlen(outfname); i; --i){
7057 if (outfname[i - 1] == '/'
7058 || outfname[i - 1] == '\\'){
7064 strcat(outfname, "ntXXXXXX");
7066 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
7067 S_IREAD | S_IWRITE);
7069 strcat(outfname, ".nkftmpXXXXXX");
7070 fd = mkstemp(outfname);
/* Keep a duplicate of stdout in fd_backup, then point stdout at fd. */
7073 || (fd_backup = dup(fileno(stdout))) < 0
7074 || dup2(fd, fileno(stdout)) < 0
7085 outfname = "nkf.out";
7088 if(freopen(outfname, "w", stdout) == NULL) {
7092 if (binmode_f == TRUE) {
7093 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7094 if (freopen("","wb",stdout) == NULL)
7101 if (binmode_f == TRUE)
7102 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7103 if (freopen("","rb",fin) == NULL)
7108 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
7112 char *filename = NULL;
/* The file name is passed to print_guessed_code only when nfiles > 1. */
7114 if (nfiles > 1) filename = origfname;
7115 if (guess_f) print_guessed_code(filename);
7121 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
/* Put the saved descriptor back on stdout. */
7129 if (dup2(fd_backup, fileno(stdout)) < 0){
7132 if (stat(origfname, &sb)) {
7133 fprintf(stderr, "Can't stat %s\n", origfname);
7135 /*
 * Restore the permissions (translated from the original Japanese comment).
 */
7136 if (chmod(outfname, sb.st_mode)) {
7137 fprintf(stderr, "Can't set permission %s\n", outfname);
7140 /*
 * Restore the timestamps (translated from the original Japanese comment).
 */
7141 if(preserve_time_f){
7142 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
7143 tb[0] = tb[1] = sb.st_mtime;
7144 if (utime(outfname, tb)) {
7145 fprintf(stderr, "Can't set timestamp %s\n", outfname);
7148 tb.actime = sb.st_atime;
7149 tb.modtime = sb.st_mtime;
7150 if (utime(outfname, &tb)) {
7151 fprintf(stderr, "Can't set timestamp %s\n", outfname);
/* Move the original file to its backup name. */
7156 char *backup_filename = get_backup_filename(backup_suffix, origfname);
7158 unlink(backup_filename);
7160 if (rename(origfname, backup_filename)) {
7161 perror(backup_filename);
7162 fprintf(stderr, "Can't rename %s to %s\n",
7163 origfname, backup_filename);
7165 nkf_xfree(backup_filename);
7168 if (unlink(origfname)){
/* Move the converted output into place under the original name. */
7173 if (rename(outfname, origfname)) {
7175 fprintf(stderr, "Can't rename %s to %s\n",
7176 outfname, origfname);
7178 nkf_xfree(outfname);
7183 if (is_argument_error)
7186 #ifdef EASYWIN /*Easy Win */
7187 if (file_out_f == FALSE)
7188 scanf("%d",&end_check);
7191 #else /* for Other OS */
7192 if (file_out_f == TRUE)
7194 #endif /*Easy Win */
7197 #endif /* WIN32DLL */