2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2009, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.1.1"
24 #define NKF_RELEASE_DATE "2010-01-25"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2010, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
215 {"ISO-2022-JP", ISO_2022_JP},
216 {"ISO2022JP-CP932", CP50220},
217 {"CP50220", CP50220},
218 {"CP50221", CP50221},
219 {"CSISO2022JP", CP50221},
220 {"CP50222", CP50222},
221 {"ISO-2022-JP-1", ISO_2022_JP_1},
222 {"ISO-2022-JP-3", ISO_2022_JP_3},
223 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
224 {"SHIFT_JIS", SHIFT_JIS},
227 {"WINDOWS-31J", WINDOWS_31J},
228 {"CSWINDOWS31J", WINDOWS_31J},
229 {"CP932", WINDOWS_31J},
230 {"MS932", WINDOWS_31J},
231 {"CP10001", CP10001},
234 {"EUCJP-NKF", EUCJP_NKF},
235 {"CP51932", CP51932},
236 {"EUC-JP-MS", EUCJP_MS},
237 {"EUCJP-MS", EUCJP_MS},
238 {"EUCJPMS", EUCJP_MS},
239 {"EUC-JP-ASCII", EUCJP_ASCII},
240 {"EUCJP-ASCII", EUCJP_ASCII},
241 {"SHIFT_JISX0213", SHIFT_JISX0213},
242 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
243 {"EUC-JISX0213", EUC_JISX0213},
244 {"EUC-JIS-2004", EUC_JIS_2004},
247 {"UTF-8-BOM", UTF_8_BOM},
248 {"UTF8-MAC", UTF8_MAC},
249 {"UTF-8-MAC", UTF8_MAC},
251 {"UTF-16BE", UTF_16BE},
252 {"UTF-16BE-BOM", UTF_16BE_BOM},
253 {"UTF-16LE", UTF_16LE},
254 {"UTF-16LE-BOM", UTF_16LE_BOM},
256 {"UTF-32BE", UTF_32BE},
257 {"UTF-32BE-BOM", UTF_32BE_BOM},
258 {"UTF-32LE", UTF_32LE},
259 {"UTF-32LE-BOM", UTF_32LE_BOM},
264 #if defined(DEFAULT_CODE_JIS)
265 #define DEFAULT_ENCIDX ISO_2022_JP
266 #elif defined(DEFAULT_CODE_SJIS)
267 #define DEFAULT_ENCIDX SHIFT_JIS
268 #elif defined(DEFAULT_CODE_WINDOWS_31J)
269 #define DEFAULT_ENCIDX WINDOWS_31J
270 #elif defined(DEFAULT_CODE_EUC)
271 #define DEFAULT_ENCIDX EUC_JP
272 #elif defined(DEFAULT_CODE_UTF8)
273 #define DEFAULT_ENCIDX UTF_8
/*
 * Locale-independent ASCII character classification helpers.
 *
 * These are plain textual macros: the argument is expanded without
 * parentheses and, in most of them, more than once.  Pass only a simple,
 * side-effect-free expression (a variable or a constant).
 * SP, TAB, CR, LF, DEL, SS3 and the CP932_TABLE_* bounds are defined
 * outside this group of macros.
 */
/* True for ASCII letters and digits. */
277 #define is_alnum(c) \
278 (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9'))
280 /* I don't trust portability of toupper */
/* Maps 'a'..'z' to 'A'..'Z'; every other value is returned unchanged. */
281 #define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c)
/* True for '0'..'7'. */
282 #define nkf_isoctal(c) ('0'<=c && c<='7')
/* True for '0'..'9'. */
283 #define nkf_isdigit(c) ('0'<=c && c<='9')
/* True for '0'..'9', 'a'..'f' and 'A'..'F'. */
284 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=c && c<='f') || ('A'<=c && c <= 'F'))
/* True for SP and TAB only. */
285 #define nkf_isblank(c) (c == SP || c == TAB)
/* True for SP, TAB, CR and LF (no other whitespace is recognised). */
286 #define nkf_isspace(c) (nkf_isblank(c) || c == CR || c == LF)
/* True for ASCII letters. */
287 #define nkf_isalpha(c) (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
/* True for ASCII letters and digits (same set as is_alnum). */
288 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
/* True for SP..'~'. */
289 #define nkf_isprint(c) (SP<=c && c<='~')
/* True for '!'..'~'. */
290 #define nkf_isgraph(c) ('!'<=c && c<='~')
/* Numeric value of a hexadecimal digit character; any other input yields 0. */
291 #define hex2bin(c) (('0'<=c&&c<='9') ? (c-'0') : \
292 ('A'<=c&&c<='F') ? (c-'A'+10) : \
293 ('a'<=c&&c<='f') ? (c-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
294 #define bin2hex(c) ("0123456789ABCDEF"[c&15])
/* True when bits 8..15 of c2 (after conversion to unsigned short) equal SS3. */
295 #define is_eucg3(c2) (((unsigned short)c2 >> 8) == SS3)
/*
 * True for CR, LF, and for characters strictly between SP and DEL other
 * than '?', '=', '_', '(', ')', '.' and '"' (0x22).
 * NOTE(review): presumably the set that may appear unescaped in MIME
 * encoded output -- confirm against the caller.
 */
296 #define nkf_noescape_mime(c) ((c == CR) || (c == LF) || \
297 ((c > SP) && (c < DEL) && (c != '?') && (c != '=') && (c != '_') \
298 && (c != '(') && (c != ')') && (c != '.') && (c != 0x22)))
/* True when c2 lies in CP932_TABLE_BEGIN..CP932_TABLE_END (inclusive). */
300 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END)
/* True when c lies in SP..0x5F (inclusive). */
301 #define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c <= 0x5F)
303 #define HOLD_SIZE 1024
304 #if defined(INT_IS_SHORT)
305 #define IOBUF_SIZE 2048
307 #define IOBUF_SIZE 16384
310 #define DEFAULT_J 'B'
311 #define DEFAULT_R 'B'
318 /* MIME preprocessor */
320 #ifdef EASYWIN /*Easy Win */
321 extern POINT _BufferSize;
330 void (*status_func)(struct input_code *, nkf_char);
331 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
335 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
336 static nkf_encoding *input_encoding = NULL;
337 static nkf_encoding *output_encoding = NULL;
339 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
341 * 0: Shift_JIS, eucJP-ascii
346 #define UCS_MAP_ASCII 0
348 #define UCS_MAP_CP932 2
349 #define UCS_MAP_CP10001 3
350 static int ms_ucs_map_f = UCS_MAP_ASCII;
352 #ifdef UTF8_INPUT_ENABLE
353 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
354 static int no_cp932ext_f = FALSE;
355 /* ignore ZERO WIDTH NO-BREAK SPACE */
356 static int no_best_fit_chars_f = FALSE;
357 static int input_endian = ENDIAN_BIG;
358 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
359 static void (*encode_fallback)(nkf_char c) = NULL;
360 static void w_status(struct input_code *, nkf_char);
362 #ifdef UTF8_OUTPUT_ENABLE
363 static int output_bom_f = FALSE;
364 static int output_endian = ENDIAN_BIG;
367 static void std_putc(nkf_char c);
368 static nkf_char std_getc(FILE *f);
369 static nkf_char std_ungetc(nkf_char c,FILE *f);
371 static nkf_char broken_getc(FILE *f);
372 static nkf_char broken_ungetc(nkf_char c,FILE *f);
374 static nkf_char mime_getc(FILE *f);
376 static void mime_putc(nkf_char c);
380 #if !defined(PERL_XS) && !defined(WIN32DLL)
381 static unsigned char stdibuf[IOBUF_SIZE];
382 static unsigned char stdobuf[IOBUF_SIZE];
/* Global conversion flags and their compile-time defaults. */
386 static int unbuf_f = FALSE;
387 static int estab_f = FALSE;
388 static int nop_f = FALSE;
389 static int binmode_f = TRUE; /* binary mode */
390 static int rot_f = FALSE; /* ROT13/47 mode */
391 static int hira_f = FALSE; /* hiragana/katakana conversion */
392 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
393 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
394 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
395 static int mimebuf_f = FALSE; /* MIME buffered input */
396 static int broken_f = FALSE; /* convert ESC-less broken JIS */
397 static int iso8859_f = FALSE; /* ISO8859 through */
398 static int mimeout_f = FALSE; /* base64 mode */
399 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
400 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
402 #ifdef UNICODE_NORMALIZATION
403 static int nfc_f = FALSE;
404 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc */
405 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
409 static int cap_f = FALSE;
410 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
411 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
413 static int url_f = FALSE;
414 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
415 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Tagging scheme for nkf_char values.
 *
 * The top 8 bits (CLASS_MASK) hold a class tag and the low 24 bits
 * (VALUE_MASK) hold the value.  CLASS_UNICODE marks a value that is a raw
 * Unicode code point.  PREFIX_EUCG3 (0x8F00) is OR-ed into a code to build
 * the value produced by nkf_char_euc3_new().
 *
 * The *_p predicates expand their argument without parentheses; pass a
 * simple expression.
 */
418 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
419 #define CLASS_MASK NKF_INT32_C(0xFF000000)
420 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
421 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
/* Largest code point in the Basic Multilingual Plane. */
422 #define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
/* Largest valid Unicode code point. */
423 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
/* Tag c with PREFIX_EUCG3. */
424 #define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
/* Tag the code point c as a Unicode-class nkf_char. */
425 #define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/* True when the class bits of c are exactly CLASS_UNICODE. */
426 #define nkf_char_unicode_p(c) ((c & CLASS_MASK) == CLASS_UNICODE)
/* True when the value bits of c are <= UNICODE_BMP_MAX (class bits are ignored). */
427 #define nkf_char_unicode_bmp_p(c) ((c & VALUE_MASK) <= UNICODE_BMP_MAX)
/* True when the value bits of c are <= UNICODE_MAX (class bits are ignored). */
428 #define nkf_char_unicode_value_p(c) ((c & VALUE_MASK) <= UNICODE_MAX)
430 #ifdef NUMCHAR_OPTION
431 static int numchar_f = FALSE;
432 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ngetc */
433 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
437 static int noout_f = FALSE;
438 static void no_putc(nkf_char c);
439 static int debug_f = FALSE;
440 static void debug(const char *str);
441 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
444 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
445 static void set_input_codename(const char *codename);
448 static int exec_f = 0;
451 #ifdef SHIFTJIS_CP932
452 /* invert IBM extended characters to others */
453 static int cp51932_f = FALSE;
455 /* invert NEC-selected IBM extended characters to IBM extended characters */
456 static int cp932inv_f = TRUE;
458 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
459 #endif /* SHIFTJIS_CP932 */
461 static int x0212_f = FALSE;
462 static int x0213_f = FALSE;
464 static unsigned char prefix_table[256];
466 static void e_status(struct input_code *, nkf_char);
467 static void s_status(struct input_code *, nkf_char);
469 struct input_code input_code_list[] = {
470 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
471 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
472 #ifdef UTF8_INPUT_ENABLE
473 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
474 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
475 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
480 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
481 static int base64_count = 0;
483 /* X0208 -> ASCII converter */
486 static int f_line = 0; /* chars in line */
487 static int f_prev = 0;
488 static int fold_preserve_f = FALSE; /* preserve new lines */
489 static int fold_f = FALSE;
490 static int fold_len = 0;
493 static unsigned char kanji_intro = DEFAULT_J;
494 static unsigned char ascii_intro = DEFAULT_R;
498 #define FOLD_MARGIN 10
499 #define DEFAULT_FOLD 60
501 static int fold_margin = FOLD_MARGIN;
503 /* process default */
506 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
508 fprintf(stderr,"nkf internal module connection failure.\n");
514 no_connection(nkf_char c2, nkf_char c1)
516 no_connection2(c2,c1,0);
519 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
520 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
522 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
523 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
524 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
525 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
526 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
527 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
528 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
530 /* static redirections */
532 static void (*o_putc)(nkf_char c) = std_putc;
534 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
535 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
537 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of bgetc */
538 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
540 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
542 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
543 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
545 /* for strict mime */
546 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
547 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
550 static int output_mode = ASCII; /* output kanji mode */
551 static int input_mode = ASCII; /* input kanji mode */
552 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
554 /* X0201 / X0208 conversion tables */
556 /* X0201 kana conversion table */
558 static const unsigned char cv[]= {
559 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
560 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
561 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
562 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
563 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
564 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
565 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
566 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
567 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
568 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
569 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
570 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
571 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
572 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
573 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
574 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
578 /* X0201 kana conversion table for dakuten (voiced sound mark) */
580 static const unsigned char dv[]= {
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
585 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
586 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
587 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
588 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
589 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
590 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
592 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 /* X0201 kana conversion table for handakuten (semi-voiced sound mark) */
601 static const unsigned char ev[]= {
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
613 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
621 /* X0208 kigou conversion table */
622 /* 0x8140 - 0x819e */
623 static const unsigned char fv[] = {
625 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
626 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
627 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
628 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
629 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
630 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
631 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
632 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
633 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
635 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
636 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
641 static int option_mode = 0;
642 static int file_out_f = FALSE;
644 static int overwrite_f = FALSE;
645 static int preserve_time_f = FALSE;
646 static int backup_f = FALSE;
647 static char *backup_suffix = "";
650 static int eolmode_f = 0; /* CR, LF, CRLF */
651 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
652 static nkf_char prev_cr = 0; /* CR or 0 */
653 #ifdef EASYWIN /*Easy Win */
654 static int end_check;
658 nkf_xmalloc(size_t size)
662 if (size == 0) size = 1;
666 perror("can't malloc");
674 nkf_xrealloc(void *ptr, size_t size)
676 if (size == 0) size = 1;
678 ptr = realloc(ptr, size);
680 perror("can't realloc");
687 #define nkf_xfree(ptr) free(ptr)
/*
 * Case-insensitive comparison of two NUL-terminated strings.
 *
 * Case folding uses nkf_toupper(), so only ASCII 'a'..'z' are folded.
 * Yields FALSE at the first differing character, and FALSE when one
 * string is a proper prefix of the other.
 */
690 nkf_str_caseeql(const char *src, const char *target)
693 for (i = 0; src[i] && target[i]; i++) {
694 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
/* The loop stopped at a NUL in at least one string; a leftover
 * character in the other means the lengths differ. */
696 if (src[i] || target[i]) return FALSE;
701 nkf_enc_from_index(int idx)
703 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
706 return &nkf_encoding_table[idx];
/*
 * Look up an encoding name in encoding_name_to_id_table and return the id
 * of the first entry whose name matches case-insensitively
 * (nkf_str_caseeql).  The table is scanned until an entry with a negative
 * id is reached, which acts as the terminator.
 */
710 nkf_enc_find_index(const char *name)
/* Skip a leading "X-" prefix (upper-case 'X' only) before matching. */
713 if (name[0] == 'X' && *(name+1) == '-') name += 2;
714 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
715 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
716 return encoding_name_to_id_table[i].id;
723 nkf_enc_find(const char *name)
726 idx = nkf_enc_find_index(name);
727 if (idx < 0) return 0;
728 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for nkf_encoding pointers.
 * `enc` must be a valid (non-NULL) pointer; none of these macros check it.
 */
/* Name string of the encoding. */
731 #define nkf_enc_name(enc) (enc)->name
/* Numeric id of the encoding. */
732 #define nkf_enc_to_index(enc) (enc)->id
/* The nkf_native_encoding this encoding is built on. */
733 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* Input converter of the base encoding. */
734 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
/* Output converter of the base encoding. */
735 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP. */
736 #define nkf_enc_asciicompat(enc) (\
737 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
738 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is one of the UTF-8 / UTF-16 / UTF-32 families. */
739 #define nkf_enc_unicode_p(enc) (\
740 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
741 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
742 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the encoding id is CP50220, CP50221 or CP50222. */
743 #define nkf_enc_cp5022x_p(enc) (\
744 nkf_enc_to_index(enc) == CP50220 ||\
745 nkf_enc_to_index(enc) == CP50221 ||\
746 nkf_enc_to_index(enc) == CP50222)
748 #ifdef DEFAULT_CODE_LOCALE
752 #ifdef HAVE_LANGINFO_H
753 return nl_langinfo(CODESET);
754 #elif defined(__WIN32__)
756 sprintf(buf, "CP%d", GetACP());
758 #elif defined(__OS2__)
759 # if defined(INT_IS_SHORT)
765 ULONG ulCP[1], ulncp;
766 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
767 if (ulCP[0] == 932 || ulCP[0] == 943)
768 strcpy(buf, "Shift_JIS");
770 sprintf(buf, "CP%lu", ulCP[0]);
778 nkf_locale_encoding()
780 nkf_encoding *enc = 0;
781 const char *encname = nkf_locale_charmap();
783 enc = nkf_enc_find(encname);
786 #endif /* DEFAULT_CODE_LOCALE */
791 return &nkf_encoding_table[UTF_8];
795 nkf_default_encoding()
797 nkf_encoding *enc = 0;
798 #ifdef DEFAULT_CODE_LOCALE
799 enc = nkf_locale_encoding();
800 #elif defined(DEFAULT_ENCIDX)
801 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
803 if (!enc) enc = nkf_utf8_encoding();
814 nkf_buf_new(int length)
816 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
817 buf->ptr = nkf_xmalloc(length);
825 nkf_buf_dispose(nkf_buf_t *buf)
832 #define nkf_buf_length(buf) ((buf)->len)
833 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
/*
 * Return the element stored at position `index` of `buf`.
 *
 * Valid positions are 0 .. buf->len - 1: nkf_buf_push() stores at
 * ptr[len] and then increments len, so ptr[len] itself has not been
 * written and, when len equals the capacity, lies outside the allocation.
 * The assertion therefore rejects index == len as well as negative
 * indices.
 */
836 nkf_buf_at(nkf_buf_t *buf, int index)
838 assert(0 <= index && index < buf->len);
839 return buf->ptr[index];
843 nkf_buf_clear(nkf_buf_t *buf)
849 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
851 if (buf->capa <= buf->len) {
854 buf->ptr[buf->len++] = c;
858 nkf_buf_pop(nkf_buf_t *buf)
860 assert(!nkf_buf_empty_p(buf));
861 return buf->ptr[--buf->len];
864 /* Normalization Form C */
867 #define fprintf dllprintf
873 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
880 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
881 #ifdef UTF8_OUTPUT_ENABLE
882 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
883 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
886 #ifdef UTF8_INPUT_ENABLE
887 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
888 " UTF option is -W[8,[16,32][B,L]]\n"
/* Upper-case J/S/E select the INPUT encoding (the lower-case j/s/e/w
 * line documents output).  NOTE(review): presumably this is the variant
 * shown when UTF8_INPUT_ENABLE is not defined. */
890 " J/S/E Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
894 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
895 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
896 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
899 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
900 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
901 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
902 " X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
905 " O Output to File (DEFAULT 'nkf.out')\n"
906 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
909 " --ic=<encoding> Specify the input encoding\n"
910 " --oc=<encoding> Specify the output encoding\n"
911 " --hiragana --katakana Hiragana/Katakana Conversion\n"
912 " --katakana-hiragana Converts each other\n"
916 " --{cap, url}-input Convert hex after ':' or '%%'\n"
918 #ifdef NUMCHAR_OPTION
919 " --numchar-input Convert Unicode Character Reference\n"
921 #ifdef UTF8_INPUT_ENABLE
922 " --fb-{skip, html, xml, perl, java, subchar}\n"
923 " Specify unassigned character's replacement\n"
928 " --in-place[=SUF] Overwrite original files\n"
929 " --overwrite[=SUF] Preserve timestamp of original files\n"
931 " -g --guess Guess the input code\n"
932 " -v --version Print the version\n"
933 " --help/-V Print this help / configuration\n"
939 show_configuration(void)
942 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
943 " Compile-time options:\n"
944 " Compiled at: " __DATE__ " " __TIME__ "\n"
947 " Default output encoding: "
948 #ifdef DEFAULT_CODE_LOCALE
949 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
950 #elif defined(DEFAULT_ENCIDX)
951 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
957 " Default output end of line: "
958 #if DEFAULT_NEWLINE == CR
960 #elif DEFAULT_NEWLINE == CRLF
966 " Decode MIME encoded string: "
967 #if MIME_DECODE_DEFAULT
973 " Convert JIS X 0201 Katakana: "
980 " --help, --version output: "
981 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build the backup file name for `filename` from `suffix`.
 *
 * Two forms are produced:
 *   - every '*' in `suffix` is replaced by `filename`, the other suffix
 *     characters being copied through unchanged;
 *   - `filename` immediately followed by `suffix`.
 * The result is a NUL-terminated string obtained from nkf_xmalloc().
 * NOTE(review): presumably the caller releases it with nkf_xfree() --
 * confirm at the call site.
 */
992 get_backup_filename(const char *suffix, const char *filename)
994 char *backup_filename;
995 int asterisk_count = 0;
997 int filename_length = strlen(filename);
/* Count the '*' placeholders so the output buffer can be sized exactly. */
999 for(i = 0; suffix[i]; i++){
1000 if(suffix[i] == '*') asterisk_count++;
/* Each one-character '*' grows to filename_length characters, hence the
 * (filename_length - 1) extra bytes per placeholder, plus 1 for the NUL. */
1004 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
1005 for(i = 0, j = 0; suffix[i];){
1006 if(suffix[i] == '*'){
/* Terminate what has been written so far so strncat() appends at j. */
1007 backup_filename[j] = '\0';
1008 strncat(backup_filename, filename, filename_length);
1010 j += filename_length;
1012 backup_filename[j++] = suffix[i++];
1015 backup_filename[j] = '\0';
/* Plain form: filename followed by suffix. */
1017 j = filename_length + strlen(suffix);
1018 backup_filename = nkf_xmalloc(j + 1);
1019 strcpy(backup_filename, filename);
1020 strcat(backup_filename, suffix);
1021 backup_filename[j] = '\0';
1023 return backup_filename;
1027 #ifdef UTF8_INPUT_ENABLE
1029 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1034 if(c >= NKF_INT32_C(1)<<shift){
1036 (*f)(0, bin2hex(c>>shift));
1047 encode_fallback_html(nkf_char c)
1052 if(c >= NKF_INT32_C(1000000))
1053 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1054 if(c >= NKF_INT32_C(100000))
1055 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1057 (*oconv)(0, 0x30+(c/10000 )%10);
1059 (*oconv)(0, 0x30+(c/1000 )%10);
1061 (*oconv)(0, 0x30+(c/100 )%10);
1063 (*oconv)(0, 0x30+(c/10 )%10);
1065 (*oconv)(0, 0x30+ c %10);
1071 encode_fallback_xml(nkf_char c)
1076 nkf_each_char_to_hex(oconv, c);
1082 encode_fallback_java(nkf_char c)
1086 if(!nkf_char_unicode_bmp_p(c)){
1090 (*oconv)(0, bin2hex(c>>20));
1091 (*oconv)(0, bin2hex(c>>16));
1095 (*oconv)(0, bin2hex(c>>12));
1096 (*oconv)(0, bin2hex(c>> 8));
1097 (*oconv)(0, bin2hex(c>> 4));
1098 (*oconv)(0, bin2hex(c ));
1103 encode_fallback_perl(nkf_char c)
1108 nkf_each_char_to_hex(oconv, c);
1114 encode_fallback_subchar(nkf_char c)
1116 c = unicode_subchar;
1117 (*oconv)((c>>8)&0xFF, c&0xFF);
1122 static const struct {
1146 {"katakana-hiragana","h3"},
1154 #ifdef UTF8_OUTPUT_ENABLE
1164 {"fb-subchar=", ""},
1166 #ifdef UTF8_INPUT_ENABLE
1167 {"utf8-input", "W"},
1168 {"utf16-input", "W16"},
1169 {"no-cp932ext", ""},
1170 {"no-best-fit-chars",""},
1172 #ifdef UNICODE_NORMALIZATION
1173 {"utf8mac-input", ""},
1185 #ifdef NUMCHAR_OPTION
1186 {"numchar-input", ""},
1192 #ifdef SHIFTJIS_CP932
1203 set_input_encoding(nkf_encoding *enc)
1205 switch (nkf_enc_to_index(enc)) {
1213 #ifdef SHIFTJIS_CP932
1216 #ifdef UTF8_OUTPUT_ENABLE
1217 ms_ucs_map_f = UCS_MAP_CP932;
1227 case ISO_2022_JP_2004:
1235 #ifdef SHIFTJIS_CP932
1238 #ifdef UTF8_OUTPUT_ENABLE
1239 ms_ucs_map_f = UCS_MAP_CP932;
1244 #ifdef SHIFTJIS_CP932
1247 #ifdef UTF8_OUTPUT_ENABLE
1248 ms_ucs_map_f = UCS_MAP_CP10001;
1257 #ifdef SHIFTJIS_CP932
1260 #ifdef UTF8_OUTPUT_ENABLE
1261 ms_ucs_map_f = UCS_MAP_CP932;
1265 #ifdef SHIFTJIS_CP932
1268 #ifdef UTF8_OUTPUT_ENABLE
1269 ms_ucs_map_f = UCS_MAP_MS;
1273 #ifdef SHIFTJIS_CP932
1276 #ifdef UTF8_OUTPUT_ENABLE
1277 ms_ucs_map_f = UCS_MAP_ASCII;
1280 case SHIFT_JISX0213:
1281 case SHIFT_JIS_2004:
1283 #ifdef SHIFTJIS_CP932
1290 #ifdef SHIFTJIS_CP932
1294 #ifdef UTF8_INPUT_ENABLE
1295 #ifdef UNICODE_NORMALIZATION
1303 input_endian = ENDIAN_BIG;
1307 input_endian = ENDIAN_LITTLE;
1312 input_endian = ENDIAN_BIG;
1316 input_endian = ENDIAN_LITTLE;
1323 set_output_encoding(nkf_encoding *enc)
1325 switch (nkf_enc_to_index(enc)) {
1328 #ifdef SHIFTJIS_CP932
1329 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1331 #ifdef UTF8_OUTPUT_ENABLE
1332 ms_ucs_map_f = UCS_MAP_CP932;
1337 #ifdef SHIFTJIS_CP932
1338 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1340 #ifdef UTF8_OUTPUT_ENABLE
1341 ms_ucs_map_f = UCS_MAP_CP932;
1345 #ifdef SHIFTJIS_CP932
1346 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1351 #ifdef SHIFTJIS_CP932
1352 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1358 #ifdef SHIFTJIS_CP932
1359 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1366 #ifdef UTF8_OUTPUT_ENABLE
1367 ms_ucs_map_f = UCS_MAP_CP932;
1371 #ifdef UTF8_OUTPUT_ENABLE
1372 ms_ucs_map_f = UCS_MAP_CP10001;
1377 #ifdef SHIFTJIS_CP932
1378 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1380 #ifdef UTF8_OUTPUT_ENABLE
1381 ms_ucs_map_f = UCS_MAP_ASCII;
1386 #ifdef SHIFTJIS_CP932
1387 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1389 #ifdef UTF8_OUTPUT_ENABLE
1390 ms_ucs_map_f = UCS_MAP_ASCII;
1395 #ifdef SHIFTJIS_CP932
1396 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1398 #ifdef UTF8_OUTPUT_ENABLE
1399 ms_ucs_map_f = UCS_MAP_CP932;
1404 #ifdef UTF8_OUTPUT_ENABLE
1405 ms_ucs_map_f = UCS_MAP_MS;
1410 #ifdef UTF8_OUTPUT_ENABLE
1411 ms_ucs_map_f = UCS_MAP_ASCII;
1414 case SHIFT_JISX0213:
1415 case SHIFT_JIS_2004:
1417 #ifdef SHIFTJIS_CP932
1418 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1425 #ifdef SHIFTJIS_CP932
1426 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1429 #ifdef UTF8_OUTPUT_ENABLE
1431 output_bom_f = TRUE;
1435 output_bom_f = TRUE;
1438 output_endian = ENDIAN_LITTLE;
1439 output_bom_f = FALSE;
1442 output_endian = ENDIAN_LITTLE;
1443 output_bom_f = TRUE;
1447 output_bom_f = TRUE;
1450 output_endian = ENDIAN_LITTLE;
1451 output_bom_f = FALSE;
1454 output_endian = ENDIAN_LITTLE;
1455 output_bom_f = TRUE;
1461 static struct input_code*
1462 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1465 struct input_code *p = input_code_list;
1467 if (iconv_func == p->iconv_func){
1477 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1479 #ifdef INPUT_CODE_FIX
1480 if (f || !input_encoding)
1487 #ifdef INPUT_CODE_FIX
1488 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1494 if (estab_f && iconv_for_check != iconv){
1495 struct input_code *p = find_inputcode_byfunc(iconv);
1497 set_input_codename(p->name);
1500 iconv_for_check = iconv;
1507 x0212_shift(nkf_char c)
1512 if (0x75 <= c && c <= 0x7f){
1513 ret = c + (0x109 - 0x75);
1516 if (0x75 <= c && c <= 0x7f){
1517 ret = c + (0x113 - 0x75);
1525 x0212_unshift(nkf_char c)
1528 if (0x7f <= c && c <= 0x88){
1529 ret = c + (0x75 - 0x7f);
1530 }else if (0x89 <= c && c <= 0x92){
1531 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1535 #endif /* X0212_ENABLE */
1538 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1544 if((0x21 <= ndx && ndx <= 0x2F)){
1545 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1546 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1548 }else if(0x6E <= ndx && ndx <= 0x7E){
1549 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1550 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1556 else if(nkf_isgraph(ndx)){
1558 const unsigned short *ptr;
1559 ptr = x0212_shiftjis[ndx - 0x21];
1561 val = ptr[(c1 & 0x7f) - 0x21];
1570 c2 = x0212_shift(c2);
1572 #endif /* X0212_ENABLE */
1574 if(0x7F < c2) return 1;
1575 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1576 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s2e_conv: convert a two-byte code (c2 lead, c1 trail) into a row/cell pair.
 * Returns 1 immediately when c1 is above 0xFC.
 * NOTE(review): judging by the name and the shiftjis_* tables this is the
 * Shift_JIS -> EUC-JP direction, with results presumably stored through
 * p2/p1 -- confirm.
 */
1581 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1583 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1586 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1587 if (0xFC < c1) return 1;
1588 #ifdef SHIFTJIS_CP932
/* CP932 IBM-extension lead bytes are remapped through shiftjis_cp932. */
1589 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1590 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1597 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1598 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1604 #endif /* SHIFTJIS_CP932 */
/* Without X0213, IBM-extension lead bytes map via shiftjis_x0212 and get the G3 prefix. */
1606 if (!x0213_f && is_ibmext_in_sjis(c2)){
1607 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1610 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
/* With X0213, lead bytes 0xF0 and above select plane-2 rows. */
1623 if(x0213_f && c2 >= 0xF0){
1624 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1625 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1626 }else{ /* 78<=k<=94 */
1627 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1628 if (0x9E < c1) c2++;
1631 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1632 #define SJ6394 0x0161 /* 63 - 94 ku offset */
/* Generic mapping: double the lead byte, subtract the ku offset, bump for the upper half. */
1633 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1634 if (0x9E < c1) c2++;
1637 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1644 c2 = x0212_unshift(c2);
1651 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * nkf_unicode_to_utf8: encode the code point val as UTF-8, writing the bytes
 * through p1..p4.
 *   val < 0x800                  -> 2 bytes (110xxxxx 10xxxxxx)
 *   nkf_char_unicode_bmp_p(val)   -> 3 bytes
 *   nkf_char_unicode_value_p(val) -> 4 bytes
 */
1653 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1661 }else if (val < 0x800){
1662 *p1 = 0xc0 | (val >> 6);
1663 *p2 = 0x80 | (val & 0x3f);
1666 } else if (nkf_char_unicode_bmp_p(val)) {
1667 *p1 = 0xe0 | (val >> 12);
1668 *p2 = 0x80 | ((val >> 6) & 0x3f);
1669 *p3 = 0x80 | ( val & 0x3f);
1671 } else if (nkf_char_unicode_value_p(val)) {
1672 *p1 = 0xf0 | (val >> 18);
1673 *p2 = 0x80 | ((val >> 12) & 0x3f);
1674 *p3 = 0x80 | ((val >> 6) & 0x3f);
1675 *p4 = 0x80 | ( val & 0x3f);
/*
 * nkf_utf8_to_unicode: assemble a code point from up to four UTF-8 bytes
 * (c1 is the lead byte, c2..c4 the following bytes). The branch is chosen
 * from the lead byte's range.
 */
1685 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/*
 * NOTE(review): 0xC2 and 0xC3 are valid two-byte lead bytes in UTF-8
 * (RFC 3629 allows C2..DF); this bound looks like it should be 0xC1 -- confirm.
 */
1692 else if (c1 <= 0xC3) {
1693 /* trail byte or invalid */
1696 else if (c1 <= 0xDF) {
1698 wc = (c1 & 0x1F) << 6;
1701 else if (c1 <= 0xEF) {
1703 wc = (c1 & 0x0F) << 12;
1704 wc |= (c2 & 0x3F) << 6;
/*
 * NOTE(review): every sibling branch tests c1, but this one tests c2. A trail
 * byte is always <= 0xF4, so lead bytes above 0xF4 would not be rejected here.
 * Looks like a typo for c1 -- confirm.
 */
1707 else if (c2 <= 0xF4) {
1709 wc = (c1 & 0x0F) << 18;
1710 wc |= (c2 & 0x3F) << 12;
1711 wc |= (c3 & 0x3F) << 6;
1721 #ifdef UTF8_INPUT_ENABLE
/*
 * unicode_to_jis_common2: two-level table lookup. c1 indexes the table of
 * rows pp (psize entries) and c0 is range-checked against
 * sizeof_utf8_to_euc_C2 before selecting a cell.
 * Returns 1 when pp is null, c1 or c0 is out of range, the selected row is
 * null, or the looked-up value is 0.
 * When no_cp932ext_f is set, values in row 0x2D (NEC special characters) or
 * above 0xF300 (IBM extended characters) are filtered.
 * A resulting c2 equal to SO is rewritten to JIS_X_0201_1976_K.
 */
1723 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1724 const unsigned short *const *pp, nkf_char psize,
1725 nkf_char *p2, nkf_char *p1)
1728 const unsigned short *p;
1731 if (pp == 0) return 1;
1734 if (c1 < 0 || psize <= c1) return 1;
1736 if (p == 0) return 1;
1739 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1741 if (val == 0) return 1;
1742 if (no_cp932ext_f && (
1743 (val>>8) == 0x2D || /* NEC special characters */
1744 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1752 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * unicode_to_jis_common: map a UTF-8 byte sequence (c2, c1, c0) to a JIS
 * row/cell pair via the utf8_to_euc_* tables selected by ms_ucs_map_f.
 * When no_best_fit_chars_f is set, sequences flagged in the
 * no_best_fit_chars_* tables (indexed by c1 & 0x3F) or listed explicitly
 * below are rejected with return value 1.
 */
1760 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1762 const unsigned short *const *pp;
1763 const unsigned short *const *const *ppp;
/* Per-trail-byte reject flags (64 entries each); a non-zero entry is rejected. */
1764 static const char no_best_fit_chars_table_C2[] =
1765 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1766 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1767 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1768 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1769 static const char no_best_fit_chars_table_C2_ms[] =
1770 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1771 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1772 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1773 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1774 static const char no_best_fit_chars_table_932_C2[] =
1775 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1776 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1777 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1778 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1779 static const char no_best_fit_chars_table_932_C3[] =
1780 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1781 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1782 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1783 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* Two-byte sequences (lead byte below 0xe0). */
1789 }else if(c2 < 0xe0){
1790 if(no_best_fit_chars_f){
1791 if(ms_ucs_map_f == UCS_MAP_CP932){
1794 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1797 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1800 }else if(!cp932inv_f){
1803 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1806 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1809 }else if(ms_ucs_map_f == UCS_MAP_MS){
1810 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1811 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1829 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1830 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1831 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1833 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/*
 * Three-byte sequences.
 * NOTE(review): this tests c0 (the last byte) while the preceding branch
 * tests the lead byte c2; it looks like c2 < 0xF0 was intended -- confirm.
 */
1834 }else if(c0 < 0xF0){
1835 if(no_best_fit_chars_f){
1836 if(ms_ucs_map_f == UCS_MAP_CP932){
1837 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1838 }else if(ms_ucs_map_f == UCS_MAP_MS){
1843 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1846 if(c0 == 0x92) return 1;
1851 if(c1 == 0x80 || c0 == 0x9C) return 1;
1854 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1859 if(c0 == 0x94) return 1;
1862 if(c0 == 0xBB) return 1;
1872 if(c0 == 0x95) return 1;
1875 if(c0 == 0xA5) return 1;
1882 if(c0 == 0x8D) return 1;
1885 if(c0 == 0x9E && !cp932inv_f) return 1;
1888 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1896 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1897 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1898 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1900 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1902 #ifdef SHIFTJIS_CP932
/* On success with a G3 result, round-trip through e2s_conv/s2e_conv to normalise it. */
1903 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1905 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1906 s2e_conv(s2, s1, p2, p1);
1915 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the value for the pair (c2, c1) in the euc_to_utf8_* /
 * x0212_to_utf8_* tables. The table is chosen by the kind of c2
 * (JIS_X_0201_1976_K, EUC G3, or other) and by ms_ucs_map_f.
 * Both the row index (c2 & 0x7f) - 0x21 and the cell index
 * (c1 & 0x7f) - 0x21 are range-checked before use.
 * NOTE(review): presumably the result is a Unicode code point and 0 signals
 * "no mapping" (code_score tests !e2w_conv(...)) -- confirm.
 */
1917 e2w_conv(nkf_char c2, nkf_char c1)
1919 const unsigned short *p;
1921 if (c2 == JIS_X_0201_1976_K) {
1922 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1930 p = euc_to_utf8_1byte;
1932 } else if (is_eucg3(c2)){
1933 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1936 c2 = (c2&0x7f) - 0x21;
1937 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1938 p = x0212_to_utf8_2bytes[c2];
1944 c2 = (c2&0x7f) - 0x21;
1945 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1947 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1948 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1949 euc_to_utf8_2bytes_ms[c2];
1954 c1 = (c1 & 0x7f) - 0x21;
1955 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w2e_conv: convert a UTF-8 sequence (c2 lead, then c1, c0) to a row/cell
 * pair through unicode_to_jis_common() for lead bytes 0xc0..0xef.
 * With NUMCHAR_OPTION, *p1 can instead receive the raw code point wrapped by
 * nkf_char_unicode_new().
 * NOTE(review): presumably that is the fallback when the table lookup
 * fails -- confirm.
 */
1962 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1969 }else if (0xc0 <= c2 && c2 <= 0xef) {
1970 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1971 #ifdef NUMCHAR_OPTION
1974 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1982 #ifdef UTF8_INPUT_ENABLE
/*
 * w16e_conv: convert the code point val to a row/cell pair.
 * BMP values are re-encoded as UTF-8 and passed to unicode_to_jis_common().
 * In two places *p1 is instead set to the code point wrapped by
 * nkf_char_unicode_new().
 * NOTE(review): presumably those are the lookup-failure and non-BMP
 * paths -- confirm.
 */
1984 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1986 nkf_char c1, c2, c3, c4;
1993 else if (nkf_char_unicode_bmp_p(val)){
1994 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1995 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1998 *p1 = nkf_char_unicode_new(val);
2004 *p1 = nkf_char_unicode_new(val);
/*
 * e_iconv: input converter taking (c2, c1, c0).
 * NOTE(review): by name and the SS2 / 0x8f handling this is the EUC-JP
 * input path -- confirm.
 */
2011 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* SS2 or JIS X 0201 kana: replaced by GETA1/GETA2 when iso2022jp_f is set without x0201_f. */
2013 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
2014 if (iso2022jp_f && !x0201_f) {
2015 c2 = GETA1; c1 = GETA2;
2017 c2 = JIS_X_0201_1976_K;
/* Lead byte 0x8f: three-byte form. */
2021 }else if (c2 == 0x8f){
2025 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2026 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2027 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2030 c2 = (c2 << 8) | (c1 & 0x7f);
2032 #ifdef SHIFTJIS_CP932
2035 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2036 s2e_conv(s2, s1, &c2, &c1);
2043 #endif /* SHIFTJIS_CP932 */
2045 #endif /* X0212_ENABLE */
2046 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
/* Two-byte form: rows 0xF5..0xFE map to the private-use area starting at U+E000. */
2049 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2050 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2051 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2056 #ifdef SHIFTJIS_CP932
2057 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2059 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2060 s2e_conv(s2, s1, &c2, &c1);
2067 #endif /* SHIFTJIS_CP932 */
/*
 * s_iconv: input converter taking (c2, c1, c0).
 * NOTE(review): by name and the s2e_conv() call this is the Shift_JIS
 * input path -- confirm.
 * Lead bytes 0xF0..0xF9 (when x0213_f is off) map to the private-use area
 * starting at U+E000, 188 cells per lead byte, with trail byte 0x7F skipped.
 * Other double-byte codes go through s2e_conv(); a non-zero result is returned.
 */
2075 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2077 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2078 if (iso2022jp_f && !x0201_f) {
2079 c2 = GETA1; c1 = GETA2;
2083 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2085 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2087 if(c1 == 0x7F) return 0;
2088 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2091 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2092 if (ret) return ret;
/*
 * w_iconv: input converter for UTF-8. Unlike its siblings, the parameters
 * are in reading order (c1 is the lead byte).
 * The lead byte is classified through w_iconv_utf8_1st_byte (indexed by
 * c1 - 0xC0), and the continuation bytes are validated against ranges that
 * depend on the class.
 * Returns -1 or -2 when c3 is still 0.
 * NOTE(review): presumably -1 / -2 mean one / two more bytes are
 * needed -- confirm.
 * Four-byte sequences are turned into a wrapped code point; all others go
 * through w2e_conv().
 */
2099 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2101 nkf_char ret = 0, c4 = 0;
2102 static const char w_iconv_utf8_1st_byte[] =
2104 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2105 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2106 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2107 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2114 if (c1 < 0 || 0xff < c1) {
2115 }else if (c1 == 0) { /* 0 : 1 byte*/
2117 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2120 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2122 if (c2 < 0x80 || 0xBF < c2) return 0;
2125 if (c3 == 0) return -1;
2126 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2131 if (c3 == 0) return -1;
2132 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2136 if (c3 == 0) return -1;
2137 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2141 if (c3 == 0) return -2;
2142 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2146 if (c3 == 0) return -2;
2147 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2151 if (c3 == 0) return -2;
2152 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2160 if (c1 == 0 || c1 == EOF){
2161 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2162 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2165 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2173 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * unicode_iconv: convert a single code point wc.
 * Surrogate values ((wc >> 11) == 27, i.e. 0xD800..0xDFFF) and values in the
 * final branch return NKF_ICONV_INVALID_CODE_RANGE.
 * Values below 0xFFFF go through w16e_conv(); values below 0x10FFFF are
 * wrapped with nkf_char_unicode_new().
 * NOTE(review): both upper bounds use a strict '<', so U+FFFF takes the
 * supplementary-plane branch and U+10FFFF is treated as out of
 * range -- confirm this is intended.
 */
2175 unicode_iconv(nkf_char wc)
2183 }else if ((wc>>11) == 27) {
2184 /* unpaired surrogate */
2185 return NKF_ICONV_INVALID_CODE_RANGE;
2186 }else if (wc < 0xFFFF) {
2187 ret = w16e_conv(wc, &c2, &c1);
2188 if (ret) return ret;
2189 }else if (wc < 0x10FFFF) {
2191 c1 = nkf_char_unicode_new(wc);
2193 return NKF_ICONV_INVALID_CODE_RANGE;
2199 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2200 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
2201 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * nkf_iconv_utf_16: decode one UTF-16 unit or surrogate pair from the bytes
 * c1..c4, honouring input_endian, and hand the code point to unicode_iconv.
 * When the first unit is a high surrogate (high byte 0xD8..0xDB) the second
 * unit's high byte must be 0xDC..0xDF. Otherwise
 * NKF_ICONV_NEED_TWO_MORE_BYTES is returned.
 */
2203 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/* Big endian: c1 and c3 are the high bytes. */
2212 if (input_endian == ENDIAN_BIG) {
2213 if (0xD8 <= c1 && c1 <= 0xDB) {
2214 if (0xDC <= c3 && c3 <= 0xDF) {
2215 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2216 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* Other byte order: c2 and c4 are the high bytes. */
2221 if (0xD8 <= c2 && c2 <= 0xDB) {
2222 if (0xDC <= c4 && c4 <= 0xDF) {
2223 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2224 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2230 return (*unicode_iconv)(wc);
/*
 * w_iconv16: converter entry used to identify UTF-16 input (compared by
 * address elsewhere, e.g. iconv == w_iconv16). Returns the constant 16 so
 * that its body differs from w_iconv32.
 */
2234 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2237 return 16; /* different from w_iconv32 */
/*
 * w_iconv32: converter entry used to identify UTF-32 input (compared by
 * address elsewhere, e.g. iconv == w_iconv32). Returns the constant 32 so
 * that its body differs from w_iconv16.
 */
2241 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
2244 return 32; /* different from w_iconv16 */
/*
 * nkf_iconv_utf_32: assemble a code point from four input bytes according to
 * input_endian and hand it to unicode_iconv.
 * Each byte order combines three of the four bytes into a 24-bit value; the
 * remaining (most significant) byte does not appear in these expressions.
 * An unrecognised byte order returns NKF_ICONV_INVALID_CODE_RANGE.
 */
2248 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2257 switch(input_endian){
2259 wc = c2 << 16 | c3 << 8 | c4;
2262 wc = c3 << 16 | c2 << 8 | c1;
2265 wc = c1 << 16 | c4 << 8 | c3;
2268 wc = c4 << 16 | c1 << 8 | c2;
2271 return NKF_ICONV_INVALID_CODE_RANGE;
2274 return (*unicode_iconv)(wc);
/*
 * output_ascii_escape_sequence(mode): when the current output_mode is neither
 * ASCII nor ISO_8859_1, emit an escape sequence ending in ascii_intro and
 * switch output_mode to mode.
 */
2278 #define output_ascii_escape_sequence(mode) do { \
2279 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2282 (*o_putc)(ascii_intro); \
2283 output_mode = mode; \
/*
 * output_escape_sequence: emit the escape sequence that switches the output
 * to the given mode.
 * The test on output_mode == mode presumably makes this a no-op when that
 * mode is already active -- confirm.
 */
2288 output_escape_sequence(int mode)
2290 if (output_mode == mode)
2298 case JIS_X_0201_1976_K:
2306 (*o_putc)(kanji_intro);
/*
 * j_oconv: output converter that emits (c2, c1) with escape sequences
 * selecting ASCII, ISO_8859_1, JIS X 0201 kana, JIS X 0212 / 0213 plane 2, or
 * JIS X 0208 / 0213 plane 1.
 * NOTE(review): by name and the escape handling this is the ISO-2022-JP
 * output path -- confirm.
 */
2331 j_oconv(nkf_char c2, nkf_char c1)
2333 #ifdef NUMCHAR_OPTION
/* A wrapped Unicode value is first converted; if still unmapped, try the user-defined area or the fallback. */
2334 if (c2 == 0 && nkf_char_unicode_p(c1)){
2335 w16e_conv(c1, &c2, &c1);
2336 if (c2 == 0 && nkf_char_unicode_p(c1)){
2337 c2 = c1 & VALUE_MASK;
2338 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2341 c2 = 0x7F + c1 / 94;
2342 c1 = 0x21 + c1 % 94;
2344 if (encode_fallback) (*encode_fallback)(c1);
2351 output_ascii_escape_sequence(ASCII);
2354 else if (c2 == EOF) {
2355 output_ascii_escape_sequence(ASCII);
2358 else if (c2 == ISO_8859_1) {
2359 output_ascii_escape_sequence(ISO_8859_1);
2362 else if (c2 == JIS_X_0201_1976_K) {
2363 output_escape_sequence(JIS_X_0201_1976_K);
2366 } else if (is_eucg3(c2)){
2367 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2368 (*o_putc)(c2 & 0x7f);
/* Out-of-range row/cell values are silently dropped. */
2373 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2374 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2375 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
/*
 * e_oconv: output converter that emits (c2, c1) as bytes with the high bit
 * set, using SS2 for JIS X 0201 kana.
 * NOTE(review): by name and output_mode = EUC_JP this is the EUC-JP
 * output path -- confirm.
 */
2382 e_oconv(nkf_char c2, nkf_char c1)
/* A wrapped Unicode value is first converted; if still unmapped, try the user-defined area or the fallback. */
2384 if (c2 == 0 && nkf_char_unicode_p(c1)){
2385 w16e_conv(c1, &c2, &c1);
2386 if (c2 == 0 && nkf_char_unicode_p(c1)){
2387 c2 = c1 & VALUE_MASK;
2388 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2392 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2393 c1 = 0x21 + c1 % 94;
2396 (*o_putc)((c2 & 0x7f) | 0x080);
2397 (*o_putc)(c1 | 0x080);
2399 (*o_putc)((c2 & 0x7f) | 0x080);
2400 (*o_putc)(c1 | 0x080);
2404 if (encode_fallback) (*encode_fallback)(c1);
2412 } else if (c2 == 0) {
2413 output_mode = ASCII;
2415 } else if (c2 == JIS_X_0201_1976_K) {
2416 output_mode = EUC_JP;
2417 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2418 } else if (c2 == ISO_8859_1) {
2419 output_mode = ISO_8859_1;
2420 (*o_putc)(c1 | 0x080);
2422 } else if (is_eucg3(c2)){
2423 output_mode = EUC_JP;
2424 #ifdef SHIFTJIS_CP932
/* Round-trip through e2s_conv/s2e_conv to normalise a G3 code. */
2427 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2428 s2e_conv(s2, s1, &c2, &c1);
2433 output_mode = ASCII;
2435 }else if (is_eucg3(c2)){
2438 (*o_putc)((c2 & 0x7f) | 0x080);
2439 (*o_putc)(c1 | 0x080);
2442 (*o_putc)((c2 & 0x7f) | 0x080);
2443 (*o_putc)(c1 | 0x080);
/* Plain two-byte code: both halves must be graphic characters. */
2447 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2448 set_iconv(FALSE, 0);
2449 return; /* too late to rescue this char */
2451 output_mode = EUC_JP;
2452 (*o_putc)(c2 | 0x080);
2453 (*o_putc)(c1 | 0x080);
/*
 * s_oconv: output converter that maps (c2, c1) through e2s_conv() before
 * emitting it.
 * NOTE(review): by name and output_mode = SHIFT_JIS this is the Shift_JIS
 * output path -- confirm.
 */
2458 s_oconv(nkf_char c2, nkf_char c1)
2460 #ifdef NUMCHAR_OPTION
/* A wrapped Unicode value is first converted; if still unmapped, try the user-defined area or the fallback. */
2461 if (c2 == 0 && nkf_char_unicode_p(c1)){
2462 w16e_conv(c1, &c2, &c1);
2463 if (c2 == 0 && nkf_char_unicode_p(c1)){
2464 c2 = c1 & VALUE_MASK;
2465 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2468 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2470 c1 += 0x40 + (c1 > 0x3e);
2475 if(encode_fallback)(*encode_fallback)(c1);
2484 } else if (c2 == 0) {
2485 output_mode = ASCII;
2487 } else if (c2 == JIS_X_0201_1976_K) {
2488 output_mode = SHIFT_JIS;
2490 } else if (c2 == ISO_8859_1) {
2491 output_mode = ISO_8859_1;
2492 (*o_putc)(c1 | 0x080);
2494 } else if (is_eucg3(c2)){
2495 output_mode = SHIFT_JIS;
2496 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* Plain two-byte code: both halves must be printable. */
2502 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2503 set_iconv(FALSE, 0);
2504 return; /* too late to rescue this char */
2506 output_mode = SHIFT_JIS;
2507 e2s_conv(c2, c1, &c2, &c1);
2509 #ifdef SHIFTJIS_CP932
/* Optional remap through the cp932inv table. */
2511 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2512 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2518 #endif /* SHIFTJIS_CP932 */
/* Emit the configured prefix byte, if any, for this trail byte. */
2521 if (prefix_table[(unsigned char)c1]){
2522 (*o_putc)(prefix_table[(unsigned char)c1]);
2528 #ifdef UTF8_OUTPUT_ENABLE
/*
 * w_oconv: output converter producing UTF-8.
 * A wrapped Unicode value (c2 == 0 and nkf_char_unicode_p(c1)) is unwrapped
 * with VALUE_MASK; other pairs are mapped with e2w_conv(). The value is then
 * encoded with nkf_unicode_to_utf8(), and each of bytes 2..4 is emitted only
 * when non-zero.
 * output_bom_f is cleared here.
 * NOTE(review): presumably after a BOM has been written once -- confirm.
 */
2530 w_oconv(nkf_char c2, nkf_char c1)
2536 output_bom_f = FALSE;
2547 if (c2 == 0 && nkf_char_unicode_p(c1)){
2548 val = c1 & VALUE_MASK;
2549 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2551 if (c2) (*o_putc)(c2);
2552 if (c3) (*o_putc)(c3);
2553 if (c4) (*o_putc)(c4);
2560 val = e2w_conv(c2, c1);
2562 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2564 if (c2) (*o_putc)(c2);
2565 if (c3) (*o_putc)(c3);
2566 if (c4) (*o_putc)(c4);
/*
 * w_oconv16: output converter producing UTF-16 in the byte order given by
 * output_endian.
 * BMP values are split into high and low bytes. Values up to UNICODE_MAX
 * outside the BMP are written as a surrogate pair.
 */
2572 w_oconv16(nkf_char c2, nkf_char c1)
2575 output_bom_f = FALSE;
2576 if (output_endian == ENDIAN_LITTLE){
2590 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2591 if (nkf_char_unicode_bmp_p(c1)) {
2592 c2 = (c1 >> 8) & 0xff;
2596 if (c1 <= UNICODE_MAX) {
/* 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000 bias into the lead surrogate. */
2597 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2598 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2599 if (output_endian == ENDIAN_LITTLE){
2600 (*o_putc)(c2 & 0xff);
2601 (*o_putc)((c2 >> 8) & 0xff);
2602 (*o_putc)(c1 & 0xff);
2603 (*o_putc)((c1 >> 8) & 0xff);
2605 (*o_putc)((c2 >> 8) & 0xff);
2606 (*o_putc)(c2 & 0xff);
2607 (*o_putc)((c1 >> 8) & 0xff);
2608 (*o_putc)(c1 & 0xff);
/* Non-Unicode pairs are mapped with e2w_conv() first. */
2614 nkf_char val = e2w_conv(c2, c1);
2615 c2 = (val >> 8) & 0xff;
2620 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output converter producing UTF-32 in the byte order given by
 * output_endian. Pairs that are not already wrapped Unicode values are
 * mapped with e2w_conv(). The low three bytes of the value are emitted
 * least-significant first for ENDIAN_LITTLE and most-significant first
 * otherwise.
 */
2630 w_oconv32(nkf_char c2, nkf_char c1)
2633 output_bom_f = FALSE;
2634 if (output_endian == ENDIAN_LITTLE){
2652 if (c2 == ISO_8859_1) {
2654 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2657 c1 = e2w_conv(c2, c1);
2660 if (output_endian == ENDIAN_LITTLE){
2661 (*o_putc)( c1 & 0xFF);
2662 (*o_putc)((c1 >> 8) & 0xFF);
2663 (*o_putc)((c1 >> 16) & 0xFF);
2667 (*o_putc)((c1 >> 16) & 0xFF);
2668 (*o_putc)((c1 >> 8) & 0xFF);
2669 (*o_putc)( c1 & 0xFF);
2674 #define SCORE_L2 (1) /* Kanji Level 2 */
2675 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2676 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2677 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2678 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2679 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */
2680 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2681 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
2683 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score penalties looked up by code_score() with index (c2 & 0x0f) when
 * (c2 & 0x70) == 0x20.
 */
2685 static const nkf_char score_table_A0[] = {
2688 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2689 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score penalties looked up by code_score() with index (c2 & 0x0f) when
 * (c2 & 0x70) == 0x70.
 */
2692 static const nkf_char score_table_F0[] = {
2693 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2694 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2695 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2696 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* set_code_score: OR the given SCORE_* bits into ptr->score. */
2700 set_code_score(struct input_code *ptr, nkf_char score)
2703 ptr->score |= score;
/* clr_code_score: clear the given SCORE_* bits from ptr->score. */
2708 clr_code_score(struct input_code *ptr, nkf_char score)
2711 ptr->score &= ~score;
/*
 * code_score: add penalty bits to ptr->score based on the pair buffered in
 * ptr->buf[0] (c2) and ptr->buf[1] (c1).
 * SS2 scores SCORE_KANA, 0x8f scores SCORE_X0212, and a pair for which
 * e2w_conv() returns 0 scores SCORE_NO_EXIST. Otherwise the penalty is
 * chosen from the row bits of c2.
 */
2716 code_score(struct input_code *ptr)
2718 nkf_char c2 = ptr->buf[0];
2719 #ifdef UTF8_OUTPUT_ENABLE
2720 nkf_char c1 = ptr->buf[1];
2723 set_code_score(ptr, SCORE_ERROR);
2724 }else if (c2 == SS2){
2725 set_code_score(ptr, SCORE_KANA);
2726 }else if (c2 == 0x8f){
2727 set_code_score(ptr, SCORE_X0212);
2728 #ifdef UTF8_OUTPUT_ENABLE
2729 }else if (!e2w_conv(c2, c1)){
2730 set_code_score(ptr, SCORE_NO_EXIST);
2732 }else if ((c2 & 0x70) == 0x20){
2733 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2734 }else if ((c2 & 0x70) == 0x70){
2735 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2736 }else if ((c2 & 0x70) >= 0x50){
2737 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: take this candidate encoding out of consideration. If the
 * active converter is this candidate's iconv_func, the converter is reset
 * through set_iconv(FALSE, 0).
 */
2742 status_disable(struct input_code *ptr)
2747 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/* status_push_ch: append c to ptr->buf and advance ptr->index. No bounds check is done here. */
2751 status_push_ch(struct input_code *ptr, nkf_char c)
2753 ptr->buf[ptr->index++] = c;
2757 status_clear(struct input_code *ptr)
/* status_reset: restore the candidate's score to SCORE_INIT. */
2764 status_reset(struct input_code *ptr)
2767 ptr->score = SCORE_INIT;
/* status_reinit: reinitialise the candidate, including clearing ptr->_file_stat. */
2771 status_reinit(struct input_code *ptr)
2774 ptr->_file_stat = 0;
/*
 * status_check: act when c is at or below DEL and an encoding is already
 * established (estab_f). The action itself is not visible here.
 */
2778 status_check(struct input_code *ptr, nkf_char c)
2780 if (c <= DEL && estab_f){
/*
 * s_status: per-byte state machine for one candidate input encoding.
 * NOTE(review): by name and the byte ranges this tracks Shift_JIS -- confirm.
 * Bytes outside the accepted lead or trail ranges call status_disable().
 * Accepted two-byte codes are converted in place with s2e_conv().
 */
2786 s_status(struct input_code *ptr, nkf_char c)
2790 status_check(ptr, c);
2795 }else if (nkf_char_unicode_p(c)){
/* 0xa1..0xdf: single-byte kana, buffered as SS2 + byte. */
2797 }else if (0xa1 <= c && c <= 0xdf){
2798 status_push_ch(ptr, SS2);
2799 status_push_ch(ptr, c);
/* Lead-byte ranges for double-byte characters. */
2802 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2804 status_push_ch(ptr, c);
2805 }else if (0xed <= c && c <= 0xee){
2807 status_push_ch(ptr, c);
2808 #ifdef SHIFTJIS_CP932
2809 }else if (is_ibmext_in_sjis(c)){
2811 status_push_ch(ptr, c);
2812 #endif /* SHIFTJIS_CP932 */
2814 }else if (0xf0 <= c && c <= 0xfc){
2816 status_push_ch(ptr, c);
2817 #endif /* X0212_ENABLE */
2819 status_disable(ptr);
/* Trail byte: must be 0x40..0x7e or 0x80..0xfc. */
2823 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2824 status_push_ch(ptr, c);
2825 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2829 status_disable(ptr);
2833 #ifdef SHIFTJIS_CP932
2834 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2835 status_push_ch(ptr, c);
2836 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2837 set_code_score(ptr, SCORE_CP932);
2842 #endif /* SHIFTJIS_CP932 */
2843 status_disable(ptr);
2846 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2847 status_push_ch(ptr, c);
2848 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2849 set_code_score(ptr, SCORE_CP932);
2852 status_disable(ptr);
/*
 * e_status: per-byte state machine for one candidate input encoding.
 * NOTE(review): by name and the SS2 / 0x8f / 0xa1..0xfe ranges this tracks
 * EUC-JP -- confirm.
 * Bytes outside the accepted ranges call status_disable().
 */
2859 e_status(struct input_code *ptr, nkf_char c)
2863 status_check(ptr, c);
2868 }else if (nkf_char_unicode_p(c)){
/* First byte: SS2 or 0xa1..0xfe starts a two-byte code; 0x8f starts a three-byte code. */
2870 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2872 status_push_ch(ptr, c);
2874 }else if (0x8f == c){
2876 status_push_ch(ptr, c);
2877 #endif /* X0212_ENABLE */
2879 status_disable(ptr);
/* Following bytes must be 0xa1..0xfe. */
2883 if (0xa1 <= c && c <= 0xfe){
2884 status_push_ch(ptr, c);
2888 status_disable(ptr);
2893 if (0xa1 <= c && c <= 0xfe){
2895 status_push_ch(ptr, c);
2897 status_disable(ptr);
2899 #endif /* X0212_ENABLE */
2903 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: per-byte state machine for the UTF-8 candidate.
 * Lead bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 start multi-byte
 * sequences; continuation bytes must be 0x80..0xbf. Anything else calls
 * status_disable().
 * A completed sequence is converted in place with w2e_conv(), and the byte
 * sequence EF BB BF (the UTF-8 BOM) is detected separately.
 */
2905 w_status(struct input_code *ptr, nkf_char c)
2909 status_check(ptr, c);
2914 }else if (nkf_char_unicode_p(c)){
2916 }else if (0xc0 <= c && c <= 0xdf){
2918 status_push_ch(ptr, c);
2919 }else if (0xe0 <= c && c <= 0xef){
2921 status_push_ch(ptr, c);
2922 }else if (0xf0 <= c && c <= 0xf4){
2924 status_push_ch(ptr, c);
2926 status_disable(ptr);
2931 if (0x80 <= c && c <= 0xbf){
2932 status_push_ch(ptr, c);
/* Sequence complete once more bytes are buffered than the state value. */
2933 if (ptr->index > ptr->stat){
2934 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2935 && ptr->buf[2] == 0xbf);
2936 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2937 &ptr->buf[0], &ptr->buf[1]);
2944 status_disable(ptr);
2948 if (0x80 <= c && c <= 0xbf){
2949 if (ptr->index < ptr->stat){
2950 status_push_ch(ptr, c);
2955 status_disable(ptr);
/*
 * code_status: feed the byte c to every candidate's status_func in
 * input_code_list. If a single candidate remains (result) and no encoding is
 * established yet, its converter is installed with set_iconv(TRUE, ...).
 * Otherwise, for c <= DEL, the candidate list is walked again.
 * NOTE(review): presumably that second walk resets the candidates -- confirm.
 */
2963 code_status(nkf_char c)
2965 int action_flag = 1;
2966 struct input_code *result = 0;
2967 struct input_code *p = input_code_list;
2969 if (!p->status_func) {
2973 if (!p->status_func)
2975 (p->status_func)(p, c);
2978 }else if(p->stat == 0){
2989 if (result && !estab_f){
2990 set_iconv(TRUE, result->iconv_func);
2991 }else if (c <= DEL){
2992 struct input_code *ptr = input_code_list;
3002 nkf_buf_t *std_gc_buf;
3003 nkf_char broken_state;
3004 nkf_buf_t *broken_buf;
3005 nkf_char mimeout_state;
3009 static nkf_state_t *nkf_state = NULL;
3011 #define STD_GC_BUFSIZE (256)
/*
 * nkf_state_init: prepare the global nkf_state. Two paths exist: one clears
 * the three buffers, the other allocates the state with nkf_xmalloc() and
 * creates the buffers (std_gc_buf with STD_GC_BUFSIZE entries, broken_buf
 * with 3, nfc_buf with 9).
 * NOTE(review): presumably the clearing path runs when nkf_state already
 * exists and the allocating path on first use -- confirm.
 * broken_state and mimeout_state are reset to 0.
 */
3014 nkf_state_init(void)
3017 nkf_buf_clear(nkf_state->std_gc_buf);
3018 nkf_buf_clear(nkf_state->broken_buf);
3019 nkf_buf_clear(nkf_state->nfc_buf);
3022 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3023 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3024 nkf_state->broken_buf = nkf_buf_new(3);
3025 nkf_state->nfc_buf = nkf_buf_new(9);
3027 nkf_state->broken_state = 0;
3028 nkf_state->mimeout_state = 0;
3035 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3036 return nkf_buf_pop(nkf_state->std_gc_buf);
/*
 * std_ungetc: push c back onto nkf_state->std_gc_buf so that a later read
 * returns it first. The FILE argument is not used in the visible line.
 */
3043 std_ungetc(nkf_char c, FILE *f)
3045 nkf_buf_push(nkf_state->std_gc_buf, c);
3051 std_putc(nkf_char c)
3058 static nkf_char hold_buf[HOLD_SIZE*2];
3059 static int hold_count = 0;
/*
 * push_hold_buf: append c2 to hold_buf (capacity HOLD_SIZE*2).
 * Returns EOF when the buffer is full after the push, otherwise the new
 * element count. A push attempted on an already-full buffer is caught by the
 * first check.
 * NOTE(review): presumably that check also returns EOF -- confirm.
 */
3061 push_hold_buf(nkf_char c2)
3063 if (hold_count >= HOLD_SIZE*2)
3065 hold_buf[hold_count++] = c2;
3066 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * h_conv: hold incoming bytes in hold_buf while the input encoding is still
 * undecided, then replay them through the chosen converter.
 * Reading stops at EOF, when the encoding becomes established, or when the
 * hold buffer fills. If nothing was established, the candidate with the
 * lowest score is installed with set_iconv(TRUE, ...).
 */
3070 h_conv(FILE *f, nkf_char c1, nkf_char c2)
3076 /** it must NOT be in the kanji shift sequence */
3077 /** it must NOT be written in JIS7 */
3078 /** and it must be after 2 byte 8bit code */
3084 while ((c2 = (*i_getc)(f)) != EOF) {
3090 if (push_hold_buf(c2) == EOF || estab_f) {
/* Pick the candidate with the smallest score. */
3096 struct input_code *p = input_code_list;
3097 struct input_code *result = p;
3102 if (p->status_func && p->score < result->score) {
3107 set_iconv(TRUE, result->iconv_func);
3112 ** 1) EOF is detected, or
3113 ** 2) Code is established, or
3114 ** 3) Buffer is FULL (but last word is pushed)
3116 ** in 1) and 3) cases, we continue to use
3117 ** Kanji codes by oconv and leave estab_f unchanged.
/* Replay the held bytes through the converter. */
3122 while (hold_index < hold_count){
3123 c1 = hold_buf[hold_index++];
3124 if (nkf_char_unicode_p(c1)) {
3128 else if (c1 <= DEL){
3131 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3132 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3135 if (hold_index < hold_count){
3136 c2 = hold_buf[hold_index++];
3146 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
/* The converter asked for more bytes: take them from the hold buffer first, then from the stream. */
3149 if (hold_index < hold_count){
3150 c3 = hold_buf[hold_index++];
3151 } else if ((c3 = (*i_getc)(f)) == EOF) {
3156 if (hold_index < hold_count){
3157 c4 = hold_buf[hold_index++];
3158 } else if ((c4 = (*i_getc)(f)) == EOF) {
3163 (*iconv)(c1, c2, (c3<<8)|c4);
3166 /* 3 bytes EUC or UTF-8 */
3167 if (hold_index < hold_count){
3168 c3 = hold_buf[hold_index++];
3169 } else if ((c3 = (*i_getc)(f)) == EOF) {
3175 (*iconv)(c1, c2, c3);
3178 if (c3 == EOF) break;
3184 * Check and Ignore BOM
/*
 * Byte-order-mark detection on the start of the stream. Bytes are read one
 * at a time; on a mismatch everything read so far is pushed back with
 * i_ungetc in reverse order.
 * Recognised patterns, and the converter installed when input_encoding is
 * not set:
 *   00 00 FE FF -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, ENDIAN_2143
 *   EF BB BF    -> w_iconv
 *   FE FF 00 00 -> w_iconv32, ENDIAN_3412
 *   FE FF       -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, ENDIAN_LITTLE
 * input_endian is only set when the active converter matches the pattern.
 */
3190 switch(c2 = (*i_getc)(f)){
3192 if((c2 = (*i_getc)(f)) == 0x00){
3193 if((c2 = (*i_getc)(f)) == 0xFE){
3194 if((c2 = (*i_getc)(f)) == 0xFF){
3195 if(!input_encoding){
3196 set_iconv(TRUE, w_iconv32);
3198 if (iconv == w_iconv32) {
3199 input_endian = ENDIAN_BIG;
3202 (*i_ungetc)(0xFF,f);
3203 }else (*i_ungetc)(c2,f);
3204 (*i_ungetc)(0xFE,f);
3205 }else if(c2 == 0xFF){
3206 if((c2 = (*i_getc)(f)) == 0xFE){
3207 if(!input_encoding){
3208 set_iconv(TRUE, w_iconv32);
3210 if (iconv == w_iconv32) {
3211 input_endian = ENDIAN_2143;
3214 (*i_ungetc)(0xFF,f);
3215 }else (*i_ungetc)(c2,f);
3216 (*i_ungetc)(0xFF,f);
3217 }else (*i_ungetc)(c2,f);
3218 (*i_ungetc)(0x00,f);
3219 }else (*i_ungetc)(c2,f);
3220 (*i_ungetc)(0x00,f);
3223 if((c2 = (*i_getc)(f)) == 0xBB){
3224 if((c2 = (*i_getc)(f)) == 0xBF){
3225 if(!input_encoding){
3226 set_iconv(TRUE, w_iconv);
3228 if (iconv == w_iconv) {
3231 (*i_ungetc)(0xBF,f);
3232 }else (*i_ungetc)(c2,f);
3233 (*i_ungetc)(0xBB,f);
3234 }else (*i_ungetc)(c2,f);
3235 (*i_ungetc)(0xEF,f);
3238 if((c2 = (*i_getc)(f)) == 0xFF){
3239 if((c2 = (*i_getc)(f)) == 0x00){
3240 if((c2 = (*i_getc)(f)) == 0x00){
3241 if(!input_encoding){
3242 set_iconv(TRUE, w_iconv32);
3244 if (iconv == w_iconv32) {
3245 input_endian = ENDIAN_3412;
3248 (*i_ungetc)(0x00,f);
3249 }else (*i_ungetc)(c2,f);
3250 (*i_ungetc)(0x00,f);
3251 }else (*i_ungetc)(c2,f);
3252 if(!input_encoding){
3253 set_iconv(TRUE, w_iconv16);
3255 if (iconv == w_iconv16) {
3256 input_endian = ENDIAN_BIG;
3259 (*i_ungetc)(0xFF,f);
3260 }else (*i_ungetc)(c2,f);
3261 (*i_ungetc)(0xFE,f);
3264 if((c2 = (*i_getc)(f)) == 0xFE){
3265 if((c2 = (*i_getc)(f)) == 0x00){
3266 if((c2 = (*i_getc)(f)) == 0x00){
3267 if(!input_encoding){
3268 set_iconv(TRUE, w_iconv32);
3270 if (iconv == w_iconv32) {
3271 input_endian = ENDIAN_LITTLE;
3274 (*i_ungetc)(0x00,f);
3275 }else (*i_ungetc)(c2,f);
3276 (*i_ungetc)(0x00,f);
3277 }else (*i_ungetc)(c2,f);
3278 if(!input_encoding){
3279 set_iconv(TRUE, w_iconv16);
3281 if (iconv == w_iconv16) {
3282 input_endian = ENDIAN_LITTLE;
3285 (*i_ungetc)(0xFE,f);
3286 }else (*i_ungetc)(c2,f);
3287 (*i_ungetc)(0xFF,f);
/*
 * broken_getc: input filter that recognises escape sequences whose ESC byte
 * is missing. Pushed-back bytes in nkf_state->broken_buf are returned first.
 * A '$' followed by '@' or 'B' (while in ASCII / JIS X 0201 mode), or a '('
 * followed by 'J' or 'B' (while in JIS X 0208 / JIS X 0201 mode), is matched
 * only when the previous byte was not ESC. Both bytes are then queued in
 * broken_buf.
 * NOTE(review): presumably an ESC is returned in their place so the sequence
 * is replayed as a proper escape -- confirm.
 */
3296 broken_getc(FILE *f)
3300 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3301 return nkf_buf_pop(nkf_state->broken_buf);
3304 if (c=='$' && nkf_state->broken_state != ESC
3305 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3307 nkf_state->broken_state = 0;
3308 if (c1=='@'|| c1=='B') {
3309 nkf_buf_push(nkf_state->broken_buf, c1);
3310 nkf_buf_push(nkf_state->broken_buf, c);
3316 } else if (c=='(' && nkf_state->broken_state != ESC
3317 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3319 nkf_state->broken_state = 0;
3320 if (c1=='J'|| c1=='B') {
3321 nkf_buf_push(nkf_state->broken_buf, c1);
3322 nkf_buf_push(nkf_state->broken_buf, c);
/* Remember the last byte so that a real ESC suppresses the repair above. */
3329 nkf_state->broken_state = c;
/*
 * broken_ungetc: push c back onto nkf_state->broken_buf, but only while the
 * buffer holds fewer than 2 entries. Further pushes are silently dropped.
 */
3335 broken_ungetc(nkf_char c, FILE *f)
3337 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3338 nkf_buf_push(nkf_state->broken_buf, c);
/*
 * eol_conv: end-of-line filter placed in front of o_eol_conv.
 * While guessing (guess_f), it records the input line-ending style in
 * input_eol (LF, CR or CRLF) and sets it to EOF once inconsistent endings
 * are seen.
 * For output, a pending CR or an LF is replaced by the ending selected by
 * eolmode_f (CR is emitted unless the mode is LF, LF unless the mode is CR).
 * A lone CR is deferred in prev_cr, and all other characters pass through.
 */
3343 eol_conv(nkf_char c2, nkf_char c1)
3345 if (guess_f && input_eol != EOF) {
3346 if (c2 == 0 && c1 == LF) {
3347 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3348 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3349 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3351 else if (!input_eol) input_eol = CR;
3352 else if (input_eol != CR) input_eol = EOF;
3354 if (prev_cr || (c2 == 0 && c1 == LF)) {
3356 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3357 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3359 if (c2 == 0 && c1 == CR) prev_cr = CR;
3360 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
/*
 * put_newline: emit a line ending through the single-argument callback func.
 * The style is eolmode_f when set, otherwise DEFAULT_NEWLINE.
 */
3364 put_newline(void (*func)(nkf_char))
3366 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
/*
 * oconv_newline: emit a line ending through the two-argument (c2, c1)
 * callback func. The style is eolmode_f when set, otherwise DEFAULT_NEWLINE.
 */
3381 oconv_newline(void (*func)(nkf_char, nkf_char))
3383 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3398 Return value of fold_conv()
3400 LF add newline and output char
3401 CR add newline and output nothing
3404 1 (or else) normal output
3406 fold state in prev (previous character)
3408 >0x80 Japanese (X0208/X0201)
3413 This fold algorithm does not preserve leading spaces in a line.
3414 This is the main difference from fmt.
3417 #define char_size(c2,c1) (c2?2:1)
3420 fold_conv(nkf_char c2, nkf_char c1)
3423 nkf_char fold_state;
3425 if (c1== CR && !fold_preserve_f) {
3426 fold_state=0; /* ignore cr */
3427 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3429 fold_state=0; /* ignore cr */
3430 } else if (c1== BS) {
3431 if (f_line>0) f_line--;
3433 } else if (c2==EOF && f_line != 0) { /* close open last line */
3435 } else if ((c1==LF && !fold_preserve_f)
3436 || ((c1==CR||(c1==LF&&f_prev!=CR))
3437 && fold_preserve_f)) {
3439 if (fold_preserve_f) {
3443 } else if ((f_prev == c1 && !fold_preserve_f)
3444 || (f_prev == LF && fold_preserve_f)
3445 ) { /* duplicate newline */
3448 fold_state = LF; /* output two newline */
3454 if (f_prev&0x80) { /* Japanese? */
3456 fold_state = 0; /* ignore given single newline */
3457 } else if (f_prev==SP) {
3461 if (++f_line<=fold_len)
3465 fold_state = CR; /* fold and output nothing */
3469 } else if (c1=='\f') {
3472 fold_state = LF; /* output newline and clear */
3473 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3474 /* X0208 kankaku or ascii space */
3476 fold_state = 0; /* remove duplicate spaces */
3479 if (++f_line<=fold_len)
3480 fold_state = SP; /* output ASCII space only */
3482 f_prev = SP; f_line = 0;
3483 fold_state = CR; /* fold and output nothing */
3487 prev0 = f_prev; /* we still need this one... , but almost done */
3489 if (c2 || c2 == JIS_X_0201_1976_K)
3490 f_prev |= 0x80; /* this is Japanese */
3491 f_line += char_size(c2,c1);
3492 if (f_line<=fold_len) { /* normal case */
3495 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3496 f_line = char_size(c2,c1);
3497 fold_state = LF; /* We can't wait, do fold now */
3498 } else if (c2 == JIS_X_0201_1976_K) {
3499 /* simple kinsoku rules return 1 means no folding */
3500 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3501 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3502 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3503 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3504 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3505 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3506 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3508 fold_state = LF;/* add one new f_line before this character */
3511 fold_state = LF;/* add one new f_line before this character */
3514 /* kinsoku point in ASCII */
3515 if ( c1==')'|| /* { [ ( */
3526 /* just after special */
3527 } else if (!is_alnum(prev0)) {
3528 f_line = char_size(c2,c1);
3530 } else if ((prev0==SP) || /* ignored new f_line */
3531 (prev0==LF)|| /* ignored new f_line */
3532 (prev0&0x80)) { /* X0208 - ASCII */
3533 f_line = char_size(c2,c1);
3534 fold_state = LF;/* add one new f_line before this character */
3536 fold_state = 1; /* default no fold in ASCII */
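3540 if (c1=='"') fold_state = 1; /* JIS X 0208 0x2122: ideographic comma */
3541 else if (c1=='#') fold_state = 1; /* JIS X 0208 0x2123: ideographic full stop */
3542 else if (c1=='W') fold_state = 1; /* JIS X 0208 0x2157: right corner bracket */
3543 else if (c1=='K') fold_state = 1; /* JIS X 0208 0x214B: fullwidth right parenthesis */
3544 else if (c1=='$') fold_state = 1; /* JIS X 0208 0x2124: fullwidth comma */
3545 else if (c1=='%') fold_state = 1; /* JIS X 0208 0x2125: fullwidth full stop */
3546 else if (c1=='\'') fold_state = 1; /* JIS X 0208 0x2127: fullwidth colon */
3547 else if (c1=='(') fold_state = 1; /* JIS X 0208 0x2128: fullwidth semicolon */
3548 else if (c1==')') fold_state = 1; /* JIS X 0208 0x2129: fullwidth question mark */
3549 else if (c1=='*') fold_state = 1; /* JIS X 0208 0x212A: fullwidth exclamation mark */
3550 else if (c1=='+') fold_state = 1; /* JIS X 0208 0x212B: voiced sound mark (dakuten) */
3551 else if (c1==',') fold_state = 1; /* JIS X 0208 0x212C: semi-voiced sound mark (handakuten) */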
3552 /* default no fold in kinsoku */
3555 f_line = char_size(c2,c1);
3556 /* add one new f_line before this character */
3559 f_line = char_size(c2,c1);
3561 /* add one new f_line before this character */
3566 /* terminator process */
3567 switch(fold_state) {
3569 oconv_newline(o_fconv);
3575 oconv_newline(o_fconv);
3586 static nkf_char z_prev2=0,z_prev1=0;
3589 z_conv(nkf_char c2, nkf_char c1)
3592 /* if (c2) c1 &= 0x7f; assertion */
3594 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3600 if (z_prev2 == JIS_X_0201_1976_K) {
3601 if (c2 == JIS_X_0201_1976_K) {
3602 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
3604 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3606 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
3608 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3613 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3615 if (c2 == JIS_X_0201_1976_K) {
3616 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3617 /* wait for dakuten or handakuten */
3622 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3633 if (alpha_f&1 && c2 == 0x23) {
3634 /* JISX0208 Alphabet */
3636 } else if (c2 == 0x21) {
3637 /* JISX0208 Kigou */
3642 } else if (alpha_f&4) {
3647 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3653 if (alpha_f&8 && c2 == 0) { /* HTML-entity substitution; only applies when c2 == 0 */
3655 const char *entity = 0; /* stays 0 for characters that need no replacement */
3657 case '>': entity = "&gt;"; break;
3658 case '<': entity = "&lt;"; break;
3659 case '\"': entity = "&quot;"; break;
3660 case '&': entity = "&amp;"; break;
3663 while (*entity) (*o_zconv)(0, *entity++); /* emit the entity text one byte at a time with c2 = 0 */
3669 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3674 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3678 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3682 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3686 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3690 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3694 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3698 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3702 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3707 (*o_zconv)(JIS_X_0201_1976_K, c);
3710 } else if (c2 == 0x25) {
3711 /* JISX0208 Katakana */
3712 static const int fullwidth_to_halfwidth[] =
3714 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3715 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3716 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3717 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3718 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3719 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3720 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3721 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3722 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3723 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3724 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3725 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3727 if (fullwidth_to_halfwidth[c1-0x20]){
3728 c2 = fullwidth_to_halfwidth[c1-0x20];
3729 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3731 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
3741 #define rot13(c) ( \
3743 (c <= 'M') ? (c + 13): \
3744 (c <= 'Z') ? (c - 13): \
3746 (c <= 'm') ? (c + 13): \
3747 (c <= 'z') ? (c - 13): \
3751 #define rot47(c) ( \
3753 ( c <= 'O') ? (c + 47) : \
3754 ( c <= '~') ? (c - 47) : \
3759 rot_conv(nkf_char c2, nkf_char c1)
3761 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3767 (*o_rot_conv)(c2,c1);
3771 hira_conv(nkf_char c2, nkf_char c1)
3775 if (0x20 < c1 && c1 < 0x74) {
3777 (*o_hira_conv)(c2,c1);
3779 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3781 c1 = nkf_char_unicode_new(0x3094);
3782 (*o_hira_conv)(c2,c1);
3785 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3787 (*o_hira_conv)(c2,c1);
3792 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3795 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3797 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3801 (*o_hira_conv)(c2,c1);
3806 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3808 #define RANGE_NUM_MAX 18
3809 static const nkf_char range[RANGE_NUM_MAX][2] = {
3830 nkf_char start, end, c;
3832 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3836 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3841 for (i = 0; i < RANGE_NUM_MAX; i++) {
3842 start = range[i][0];
3845 if (c >= start && c <= end) {
3850 (*o_iso2022jp_check_conv)(c2,c1);
3854 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3856 static const unsigned char *mime_pattern[] = {
3857 (const unsigned char *)"\075?EUC-JP?B?",
3858 (const unsigned char *)"\075?SHIFT_JIS?B?",
3859 (const unsigned char *)"\075?ISO-8859-1?Q?",
3860 (const unsigned char *)"\075?ISO-8859-1?B?",
3861 (const unsigned char *)"\075?ISO-2022-JP?B?",
3862 (const unsigned char *)"\075?ISO-2022-JP?B?",
3863 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3864 #if defined(UTF8_INPUT_ENABLE)
3865 (const unsigned char *)"\075?UTF-8?B?",
3866 (const unsigned char *)"\075?UTF-8?Q?",
3868 (const unsigned char *)"\075?US-ASCII?Q?",
3873 /* markers used to raise the priority of the matching encoding */
3874 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3875 e_iconv, s_iconv, 0, 0, 0, 0,
3876 #if defined(UTF8_INPUT_ENABLE)
3882 static const nkf_char mime_encode[] = {
3883 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K,
3884 #if defined(UTF8_INPUT_ENABLE)
3891 static const nkf_char mime_encode_method[] = {
3892 'B', 'B','Q', 'B', 'B', 'B', 'Q',
3893 #if defined(UTF8_INPUT_ENABLE)
3901 /* MIME preprocessor fifo */
3903 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3904 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3905 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3907 unsigned char buf[MIME_BUF_SIZE];
3909 unsigned int last; /* decoded */
3910 unsigned int input; /* undecoded */
3912 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
3914 #define MAXRECOVER 20
3917 mime_input_buf_unshift(nkf_char c)
3919 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
3923 mime_ungetc(nkf_char c, FILE *f)
3925 mime_input_buf_unshift(c);
3930 mime_ungetc_buf(nkf_char c, FILE *f)
3933 (*i_mungetc_buf)(c,f);
3935 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
3940 mime_getc_buf(FILE *f)
3942 /* we don't keep eof of mime_input_buf, because it contains ?= as
3943 a terminator. It was checked in mime_integrity. */
3944 return ((mimebuf_f)?
3945 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
3949 switch_mime_getc(void)
3951 if (i_getc!=mime_getc) {
3952 i_mgetc = i_getc; i_getc = mime_getc;
3953 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3954 if(mime_f==STRICT_MIME) {
3955 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3956 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3962 unswitch_mime_getc(void)
3964 if(mime_f==STRICT_MIME) {
3965 i_mgetc = i_mgetc_buf;
3966 i_mungetc = i_mungetc_buf;
3969 i_ungetc = i_mungetc;
3970 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3971 mime_iconv_back = NULL;
3975 mime_integrity(FILE *f, const unsigned char *p)
3979 /* In buffered mode, read until =? or NL or buffer full
3981 mime_input_state.input = mime_input_state.top;
3982 mime_input_state.last = mime_input_state.top;
3984 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3986 q = mime_input_state.input;
3987 while((c=(*i_getc)(f))!=EOF) {
3988 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3989 break; /* buffer full */
3991 if (c=='=' && d=='?') {
3992 /* checked. skip header, start decode */
3993 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3994 /* mime_last_input = mime_input_state.input; */
3995 mime_input_state.input = q;
3999 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4001 /* Should we check length mod 4? */
4002 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4005 /* In case of Incomplete MIME, no MIME decode */
4006 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4007 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
4008 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
4009 switch_mime_getc(); /* anyway we need buffered getc */
4014 mime_begin_strict(FILE *f)
4018 const unsigned char *p,*q;
4019 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4021 mime_decode_mode = FALSE;
4022 /* =? has been checked */
4024 p = mime_pattern[j];
4027 for(i=2;p[i]>SP;i++) { /* start at =? */
4028 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4029 /* pattern fails, try next one */
4031 while (mime_pattern[++j]) {
4032 p = mime_pattern[j];
4033 for(k=2;k<i;k++) /* assume length(p) > i */
4034 if (p[k]!=q[k]) break;
4035 if (k==i && nkf_toupper(c1)==p[k]) break;
4037 p = mime_pattern[j];
4038 if (p) continue; /* found next one, continue */
4039 /* all fails, output from recovery buffer */
4047 mime_decode_mode = p[i-2];
4049 mime_iconv_back = iconv;
4050 set_iconv(FALSE, mime_priority_func[j]);
4051 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4053 if (mime_decode_mode=='B') {
4054 mimebuf_f = unbuf_f;
4056 /* do MIME integrity check */
4057 return mime_integrity(f,mime_pattern[j]);
4071 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4072 /* re-read and convert again from mime_buffer. */
4074 /* =? has been checked */
4075 k = mime_input_state.last;
4076 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4077 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4078 /* We accept any character type even if it is broken by new lines */
4079 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4080 if (c1==LF||c1==SP||c1==CR||
4081 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4083 /* Failed. But this could be another MIME preamble */
4085 mime_input_state.last--;
4091 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4092 if (!(++i<MAXRECOVER) || c1==EOF) break;
4093 if (c1=='b'||c1=='B') {
4094 mime_decode_mode = 'B';
4095 } else if (c1=='q'||c1=='Q') {
4096 mime_decode_mode = 'Q';
4100 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4101 if (!(++i<MAXRECOVER) || c1==EOF) break;
4103 mime_decode_mode = FALSE;
4109 if (!mime_decode_mode) {
4110 /* false MIME preamble, restart from mime_buffer */
4111 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4112 /* Since we are in MIME mode until buffer becomes empty, */
4113 /* we never go into mime_begin again for a while. */
4116 /* discard mime preamble, and goto MIME mode */
4117 mime_input_state.last = k;
4118 /* do no MIME integrity check */
4119 return c1; /* used only for checking EOF */
4130 debug(const char *str)
4133 fprintf(stderr, "%s\n", str ? str : "NULL");
4139 set_input_codename(const char *codename)
4141 if (!input_codename) {
4142 input_codename = codename;
4143 } else if (strcmp(codename, input_codename) != 0) {
4144 input_codename = "";
4149 get_guessed_code(void)
4151 if (input_codename && !*input_codename) {
4152 input_codename = "BINARY";
4154 struct input_code *p = find_inputcode_byfunc(iconv);
4155 if (!input_codename) {
4156 input_codename = "ASCII";
4157 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4158 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4159 input_codename = "CP932";
4160 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4161 if (p->score & (SCORE_X0212))
4162 input_codename = "EUCJP-MS";
4163 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4164 input_codename = "CP51932";
4165 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4166 if (p->score & (SCORE_KANA))
4167 input_codename = "CP50221";
4168 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4169 input_codename = "CP50220";
4172 return input_codename;
4175 #if !defined(PERL_XS) && !defined(WIN32DLL)
4177 print_guessed_code(char *filename)
4179 if (filename != NULL) printf("%s: ", filename);
4180 if (input_codename && !*input_codename) {
4183 input_codename = get_guessed_code();
4185 printf("%s\n", input_codename);
4189 input_eol == CR ? " (CR)" :
4190 input_eol == LF ? " (LF)" :
4191 input_eol == CRLF ? " (CRLF)" :
4192 input_eol == EOF ? " (MIXED NL)" :
4202 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4204 nkf_char c1, c2, c3;
4210 if (!nkf_isxdigit(c2)){
4215 if (!nkf_isxdigit(c3)){
4220 return (hex2bin(c2) << 4) | hex2bin(c3);
4226 return hex_getc(':', f, i_cgetc, i_cungetc);
4230 cap_ungetc(nkf_char c, FILE *f)
4232 return (*i_cungetc)(c, f);
4238 return hex_getc('%', f, i_ugetc, i_uungetc);
4242 url_ungetc(nkf_char c, FILE *f)
4244 return (*i_uungetc)(c, f);
4248 #ifdef NUMCHAR_OPTION
4250 numchar_getc(FILE *f)
4252 nkf_char (*g)(FILE *) = i_ngetc;
4253 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4264 if (buf[i] == 'x' || buf[i] == 'X'){
4265 for (j = 0; j < 7; j++){
4267 if (!nkf_isxdigit(buf[i])){
4274 c |= hex2bin(buf[i]);
4277 for (j = 0; j < 8; j++){
4281 if (!nkf_isdigit(buf[i])){
4288 c += hex2bin(buf[i]);
4294 return nkf_char_unicode_new(c);
4304 numchar_ungetc(nkf_char c, FILE *f)
4306 return (*i_nungetc)(c, f);
4310 #ifdef UNICODE_NORMALIZATION
4315 nkf_char (*g)(FILE *f) = i_nfc_getc;
4316 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4317 nkf_buf_t *buf = nkf_state->nfc_buf;
4318 const unsigned char *array;
4319 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4320 nkf_char c = (*g)(f);
4322 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4324 nkf_buf_push(buf, c);
4326 while (lower <= upper) {
4327 int mid = (lower+upper) / 2;
4329 array = normalization_table[mid].nfd;
4330 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4331 if (len >= nkf_buf_length(buf)) {
4335 lower = 1, upper = 0;
4338 nkf_buf_push(buf, c);
4340 if (array[len] != nkf_buf_at(buf, len)) {
4341 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4342 else upper = mid - 1;
4349 array = normalization_table[mid].nfc;
4351 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4352 nkf_buf_push(buf, array[i]);
4356 } while (lower <= upper);
4358 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4359 c = nkf_buf_pop(buf);
4365 nfc_ungetc(nkf_char c, FILE *f)
4367 return (*i_nfc_ungetc)(c, f);
4369 #endif /* UNICODE_NORMALIZATION */
4373 base64decode(nkf_char c)
4378 i = c - 'A'; /* A..Z 0-25 */
4379 } else if (c == '_') {
4380 i = '?' /* 63 */ ; /* _ 63 */
4382 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4384 } else if (c > '/') {
4385 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4386 } else if (c == '+' || c == '-') {
4387 i = '>' /* 62 */ ; /* + and - 62 */
4389 i = '?' /* 63 */ ; /* / 63 */
4397 nkf_char c1, c2, c3, c4, cc;
4398 nkf_char t1, t2, t3, t4, mode, exit_mode;
4399 nkf_char lwsp_count;
4402 nkf_char lwsp_size = 128;
4404 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4405 return mime_input_buf(mime_input_state.top++);
4407 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4408 mime_decode_mode=FALSE;
4409 unswitch_mime_getc();
4410 return (*i_getc)(f);
4413 if (mimebuf_f == FIXED_MIME)
4414 exit_mode = mime_decode_mode;
4417 if (mime_decode_mode == 'Q') {
4418 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4420 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4421 if (c1<=SP || DEL<=c1) {
4422 mime_decode_mode = exit_mode; /* prepare for quit */
4425 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4429 mime_decode_mode = exit_mode; /* prepare for quit */
4430 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4431 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4432 /* end Q encoding */
4433 input_mode = exit_mode;
4435 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4436 while ((c1=(*i_getc)(f))!=EOF) {
4441 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4449 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4450 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4465 lwsp_buf[lwsp_count] = (unsigned char)c1;
4466 if (lwsp_count++>lwsp_size){
4468 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4469 lwsp_buf = lwsp_buf_new;
4475 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4477 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4478 i_ungetc(lwsp_buf[lwsp_count],f);
4481 nkf_xfree(lwsp_buf);
4484 if (c1=='='&&c2<SP) { /* this is soft wrap */
4485 while((c1 = (*i_mgetc)(f)) <=SP) {
4486 if (c1 == EOF) return (EOF);
4488 mime_decode_mode = 'Q'; /* still in MIME */
4489 goto restart_mime_q;
4492 mime_decode_mode = 'Q'; /* still in MIME */
4496 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4497 if (c2<=SP) return c2;
4498 mime_decode_mode = 'Q'; /* still in MIME */
4499 return ((hex2bin(c2)<<4) + hex2bin(c3));
4502 if (mime_decode_mode != 'B') {
4503 mime_decode_mode = FALSE;
4504 return (*i_mgetc)(f);
4508 /* Base64 encoding */
4510 MIME allows line breaks in the middle of
4511 Base64, but we are very pessimistic in decoding
4512 in unbuf mode because MIME encoded code may be broken by
4513 less or editor's control sequence (such as ESC-[-K) in unbuffered
4514 mode. Ignore incomplete MIME.
4516 mode = mime_decode_mode;
4517 mime_decode_mode = exit_mode; /* prepare for quit */
4519 while ((c1 = (*i_mgetc)(f))<=SP) {
4524 if ((c2 = (*i_mgetc)(f))<=SP) {
4527 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4528 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4531 if ((c1 == '?') && (c2 == '=')) {
4534 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4535 while ((c1=(*i_getc)(f))!=EOF) {
4540 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4548 if ((c1=(*i_getc)(f))!=EOF) {
4552 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4567 lwsp_buf[lwsp_count] = (unsigned char)c1;
4568 if (lwsp_count++>lwsp_size){
4570 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4571 lwsp_buf = lwsp_buf_new;
4577 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4579 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4580 i_ungetc(lwsp_buf[lwsp_count],f);
4583 nkf_xfree(lwsp_buf);
4587 if ((c3 = (*i_mgetc)(f))<=SP) {
4590 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4591 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4595 if ((c4 = (*i_mgetc)(f))<=SP) {
4598 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4599 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4603 mime_decode_mode = mode; /* still in MIME sigh... */
4605 /* BASE 64 decoding */
4607 t1 = 0x3f & base64decode(c1);
4608 t2 = 0x3f & base64decode(c2);
4609 t3 = 0x3f & base64decode(c3);
4610 t4 = 0x3f & base64decode(c4);
4611 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4613 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4614 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4616 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4617 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4619 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4624 return mime_input_buf(mime_input_state.top++);
4627 static const char basis_64[] =
4628 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4630 #define MIMEOUT_BUF_LENGTH 74
4632 char buf[MIMEOUT_BUF_LENGTH+1];
4636 /*nkf_char mime_lastchar2, mime_lastchar1;*/
4639 open_mime(nkf_char mode)
4641 const unsigned char *p;
4644 p = mime_pattern[0];
4645 for(i=0;mime_pattern[i];i++) {
4646 if (mode == mime_encode[i]) {
4647 p = mime_pattern[i];
4651 mimeout_mode = mime_encode_method[i];
4653 if (base64_count>45) {
4654 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4655 (*o_mputc)(mimeout_state.buf[i]);
4658 put_newline(o_mputc);
4661 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
4665 for (;i<mimeout_state.count;i++) {
4666 if (nkf_isspace(mimeout_state.buf[i])) {
4667 (*o_mputc)(mimeout_state.buf[i]);
4677 j = mimeout_state.count;
4678 mimeout_state.count = 0;
4680 mime_putc(mimeout_state.buf[i]);
4685 mime_prechar(nkf_char c2, nkf_char c1)
4687 if (mimeout_mode > 0){
4689 if (base64_count + mimeout_state.count/3*4> 73){
4690 (*o_base64conv)(EOF,0);
4691 oconv_newline(o_base64conv);
4692 (*o_base64conv)(0,SP);
4696 if (!(c2 == 0 && (c1 == CR || c1 == LF)) &&
4697 base64_count + mimeout_state.count/3*4> 66) {
4698 (*o_base64conv)(EOF,0);
4699 oconv_newline(o_base64conv);
4700 (*o_base64conv)(0,SP);
4706 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4707 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4708 open_mime(output_mode);
4709 (*o_base64conv)(EOF,0);
4710 oconv_newline(o_base64conv);
4711 (*o_base64conv)(0,SP);
4730 switch(mimeout_mode) {
4735 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
4741 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
4746 if (mimeout_mode > 0) {
4747 if (mimeout_f!=FIXED_MIME) {
4749 } else if (mimeout_mode != 'Q')
4755 mimeout_addchar(nkf_char c)
4757 switch(mimeout_mode) {
4762 } else if(!nkf_isalnum(c)) {
4764 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4765 (*o_mputc)(bin2hex((c&0xf)));
4773 nkf_state->mimeout_state=c;
4774 (*o_mputc)(basis_64[c>>2]);
4779 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4780 nkf_state->mimeout_state=c;
4785 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4786 (*o_mputc)(basis_64[c & 0x3F]);
4798 mime_putc(nkf_char c)
4803 if (mimeout_f == FIXED_MIME){
4804 if (mimeout_mode == 'Q'){
4805 if (base64_count > 71){
4806 if (c!=CR && c!=LF) {
4808 put_newline(o_mputc);
4813 if (base64_count > 71){
4815 put_newline(o_mputc);
4818 if (c == EOF) { /* c==EOF */
4822 if (c != EOF) { /* c!=EOF */
4828 /* mimeout_f != FIXED_MIME */
4830 if (c == EOF) { /* c==EOF */
4831 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4832 j = mimeout_state.count;
4833 mimeout_state.count = 0;
4835 if (mimeout_mode > 0) {
4836 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4838 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4841 mimeout_addchar(mimeout_state.buf[i]);
4845 mimeout_addchar(mimeout_state.buf[i]);
4849 mimeout_addchar(mimeout_state.buf[i]);
4855 mimeout_addchar(mimeout_state.buf[i]);
4861 if (mimeout_state.count > 0){
4862 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4867 if (mimeout_mode=='Q') {
4868 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4869 if (c == CR || c == LF) {
4874 } else if (c <= SP) {
4876 if (base64_count > 70) {
4877 put_newline(o_mputc);
4880 if (!nkf_isblank(c)) {
4885 if (base64_count > 70) {
4887 put_newline(o_mputc);
4890 open_mime(output_mode);
4892 if (!nkf_noescape_mime(c)) {
4905 if (mimeout_mode <= 0) {
4906 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
4907 output_mode == UTF_8)) {
4908 if (nkf_isspace(c)) {
4910 if (mimeout_mode == -1) {
4913 if (c==CR || c==LF) {
4915 open_mime(output_mode);
4921 for (i=0;i<mimeout_state.count;i++) {
4922 (*o_mputc)(mimeout_state.buf[i]);
4923 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4934 mimeout_state.buf[0] = (char)c;
4935 mimeout_state.count = 1;
4937 if (base64_count > 1
4938 && base64_count + mimeout_state.count > 76
4939 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4940 static const char *str = "boundary=\"";
4941 static int len = 10;
4944 for (; i < mimeout_state.count - len; ++i) {
4945 if (!strncmp(mimeout_state.buf+i, str, len)) {
4951 if (i == 0 || i == mimeout_state.count - len) {
4952 put_newline(o_mputc);
4954 if (!nkf_isspace(mimeout_state.buf[0])){
4961 for (j = 0; j <= i; ++j) {
4962 (*o_mputc)(mimeout_state.buf[j]);
4964 put_newline(o_mputc);
4966 for (; j <= mimeout_state.count; ++j) {
4967 mimeout_state.buf[j - i] = mimeout_state.buf[j];
4969 mimeout_state.count -= i;
4972 mimeout_state.buf[mimeout_state.count++] = (char)c;
4973 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4974 open_mime(output_mode);
4979 if (lastchar==CR || lastchar == LF){
4980 for (i=0;i<mimeout_state.count;i++) {
4981 (*o_mputc)(mimeout_state.buf[i]);
4984 mimeout_state.count = 0;
4987 for (i=0;i<mimeout_state.count-1;i++) {
4988 (*o_mputc)(mimeout_state.buf[i]);
4991 mimeout_state.buf[0] = SP;
4992 mimeout_state.count = 1;
4994 open_mime(output_mode);
4997 /* mimeout_mode == 'B', 1, 2 */
4998 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
4999 output_mode == UTF_8)) {
5000 if (lastchar == CR || lastchar == LF){
5001 if (nkf_isblank(c)) {
5002 for (i=0;i<mimeout_state.count;i++) {
5003 mimeout_addchar(mimeout_state.buf[i]);
5005 mimeout_state.count = 0;
5008 for (i=0;i<mimeout_state.count;i++) {
5009 (*o_mputc)(mimeout_state.buf[i]);
5012 mimeout_state.count = 0;
5014 mimeout_state.buf[mimeout_state.count++] = (char)c;
5017 if (nkf_isspace(c)) {
5018 for (i=0;i<mimeout_state.count;i++) {
5019 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
5021 for (i=0;i<mimeout_state.count;i++) {
5022 (*o_mputc)(mimeout_state.buf[i]);
5025 mimeout_state.count = 0;
5028 mimeout_state.buf[mimeout_state.count++] = (char)c;
5029 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5031 for (i=0;i<mimeout_state.count;i++) {
5032 (*o_mputc)(mimeout_state.buf[i]);
5035 mimeout_state.count = 0;
5039 if (mimeout_state.count>0 && SP<c && c!='=') {
5040 mimeout_state.buf[mimeout_state.count++] = (char)c;
5041 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5042 j = mimeout_state.count;
5043 mimeout_state.count = 0;
5045 mimeout_addchar(mimeout_state.buf[i]);
5052 if (mimeout_state.count>0) {
5053 j = mimeout_state.count;
5054 mimeout_state.count = 0;
5056 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5058 mimeout_addchar(mimeout_state.buf[i]);
5064 (*o_mputc)(mimeout_state.buf[i]);
5066 open_mime(output_mode);
5073 base64_conv(nkf_char c2, nkf_char c1)
5075 mime_prechar(c2, c1);
5076 (*o_base64conv)(c2,c1);
5080 typedef struct nkf_iconv_t {
5083 size_t input_buffer_size;
5084 char *output_buffer;
5085 size_t output_buffer_size;
5089 nkf_iconv_new(char *tocode, char *fromcode)
5091 nkf_iconv_t converter;
5093 converter->input_buffer_size = IOBUF_SIZE;
5094 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5095 converter->output_buffer_size = IOBUF_SIZE * 2;
5096 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5097 converter->cd = iconv_open(tocode, fromcode);
5098 if (converter->cd == (iconv_t)-1)
5102 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5105 perror("can't iconv_open");
5111 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5113 size_t invalid = (size_t)0;
5114 char *input_buffer = converter->input_buffer;
5115 size_t input_length = (size_t)0;
5116 char *output_buffer = converter->output_buffer;
5117 size_t output_length = converter->output_buffer_size;
5122 while ((c = (*i_getc)(f)) != EOF) {
5123 input_buffer[input_length++] = c;
5124 if (input_length < converter->input_buffer_size) break;
5128 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5129 while (output_length-- > 0) {
5130 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5132 if (ret == (size_t) - 1) {
5135 if (input_buffer != converter->input_buffer)
5136 memmove(converter->input_buffer, input_buffer, input_length);
5139 converter->output_buffer_size *= 2;
5140 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5141 if (output_buffer == NULL) {
5142 perror("can't realloc");
5145 converter->output_buffer = output_buffer;
5148 perror("can't iconv");
5161 nkf_iconv_close(nkf_iconv_t *converter) /* frees the converter's two buffers, then closes its iconv descriptor */
5163 nkf_xfree(converter->input_buffer); /* allocated in nkf_iconv_new */
5164 nkf_xfree(converter->output_buffer); /* allocated in nkf_iconv_new */
5165 iconv_close(converter->cd);
5174 struct input_code *p = input_code_list;
5186 mime_f = MIME_DECODE_DEFAULT;
5187 mime_decode_f = FALSE;
5192 x0201_f = X0201_DEFAULT;
5193 iso2022jp_f = FALSE;
5194 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5195 ms_ucs_map_f = UCS_MAP_ASCII;
5197 #ifdef UTF8_INPUT_ENABLE
5198 no_cp932ext_f = FALSE;
5199 no_best_fit_chars_f = FALSE;
5200 encode_fallback = NULL;
5201 unicode_subchar = '?';
5202 input_endian = ENDIAN_BIG;
5204 #ifdef UTF8_OUTPUT_ENABLE
5205 output_bom_f = FALSE;
5206 output_endian = ENDIAN_BIG;
5208 #ifdef UNICODE_NORMALIZATION
5224 #ifdef SHIFTJIS_CP932
5234 for (i = 0; i < 256; i++){
5235 prefix_table[i] = 0;
5239 mimeout_state.count = 0;
5244 fold_preserve_f = FALSE;
5247 kanji_intro = DEFAULT_J;
5248 ascii_intro = DEFAULT_R;
5249 fold_margin = FOLD_MARGIN;
5250 o_zconv = no_connection;
5251 o_fconv = no_connection;
5252 o_eol_conv = no_connection;
5253 o_rot_conv = no_connection;
5254 o_hira_conv = no_connection;
5255 o_base64conv = no_connection;
5256 o_iso2022jp_check_conv = no_connection;
5259 i_ungetc = std_ungetc;
5261 i_bungetc = std_ungetc;
5264 i_mungetc = std_ungetc;
5265 i_mgetc_buf = std_getc;
5266 i_mungetc_buf = std_ungetc;
5267 output_mode = ASCII;
5269 mime_decode_mode = FALSE;
5275 z_prev2=0,z_prev1=0;
5277 iconv_for_check = 0;
5279 input_codename = NULL;
5280 input_encoding = NULL;
5281 output_encoding = NULL;
5289 module_connection(void)
5291 if (input_encoding) set_input_encoding(input_encoding);
5292 if (!output_encoding) {
5293 output_encoding = nkf_default_encoding();
5295 if (!output_encoding) {
5296 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5299 set_output_encoding(output_encoding);
5300 oconv = nkf_enc_to_oconv(output_encoding);
5302 if (nkf_enc_unicode_p(output_encoding))
5303 output_mode = UTF_8;
5305 /* replace continuation module, from output side */
5307 /* output redirection */
5309 if (noout_f || guess_f){
5316 if (mimeout_f == TRUE) {
5317 o_base64conv = oconv; oconv = base64_conv;
5319 /* base64_count = 0; */
5322 if (eolmode_f || guess_f) {
5323 o_eol_conv = oconv; oconv = eol_conv;
5326 o_rot_conv = oconv; oconv = rot_conv;
5329 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5332 o_hira_conv = oconv; oconv = hira_conv;
5335 o_fconv = oconv; oconv = fold_conv;
5338 if (alpha_f || x0201_f) {
5339 o_zconv = oconv; oconv = z_conv;
5343 i_ungetc = std_ungetc;
5344 /* input redirection */
5347 i_cgetc = i_getc; i_getc = cap_getc;
5348 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5351 i_ugetc = i_getc; i_getc = url_getc;
5352 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5355 #ifdef NUMCHAR_OPTION
5357 i_ngetc = i_getc; i_getc = numchar_getc;
5358 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5361 #ifdef UNICODE_NORMALIZATION
5363 i_nfc_getc = i_getc; i_getc = nfc_getc;
5364 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5367 if (mime_f && mimebuf_f==FIXED_MIME) {
5368 i_mgetc = i_getc; i_getc = mime_getc;
5369 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5372 i_bgetc = i_getc; i_getc = broken_getc;
5373 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5375 if (input_encoding) {
5376 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5378 set_iconv(FALSE, e_iconv);
5382 struct input_code *p = input_code_list;
5391 Conversion main loop. Code detection only.
5394 #if !defined(PERL_XS) && !defined(WIN32DLL)
5401 module_connection();
5402 while ((c = (*i_getc)(f)) != EOF)
5409 #define NEXT continue /* no output, get next */
5410 #define SKIP c2=0;continue /* no output, get next */
5411 #define MORE c2=c1;continue /* need one more byte */
5412 #define SEND ; /* output c1 and c2, get next */
5413 #define LAST break /* end of loop, go closing */
5414 #define set_input_mode(mode) do { \
5415 input_mode = mode; \
5417 set_input_codename("ISO-2022-JP"); \
5418 debug("ISO-2022-JP"); \
5422 kanji_convert(FILE *f)
5424 nkf_char c1=0, c2=0, c3=0, c4=0;
5425 int shift_mode = 0; /* 0, 1, 2, 3 */
5427 int is_8bit = FALSE;
5429 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5434 output_mode = ASCII;
5436 if (module_connection() < 0) {
5437 #if !defined(PERL_XS) && !defined(WIN32DLL)
5438 fprintf(stderr, "no output encoding given\n");
5444 #ifdef UTF8_INPUT_ENABLE
5445 if(iconv == w_iconv32){
5446 while ((c1 = (*i_getc)(f)) != EOF &&
5447 (c2 = (*i_getc)(f)) != EOF &&
5448 (c3 = (*i_getc)(f)) != EOF &&
5449 (c4 = (*i_getc)(f)) != EOF) {
5450 nkf_iconv_utf_32(c1, c2, c3, c4);
5454 else if (iconv == w_iconv16) {
5455 while ((c1 = (*i_getc)(f)) != EOF &&
5456 (c2 = (*i_getc)(f)) != EOF) {
5457 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5458 (c3 = (*i_getc)(f)) != EOF &&
5459 (c4 = (*i_getc)(f)) != EOF) {
5460 nkf_iconv_utf_16(c1, c2, c3, c4);
5467 while ((c1 = (*i_getc)(f)) != EOF) {
5468 #ifdef INPUT_CODE_FIX
5469 if (!input_encoding)
5475 /* in case of 8th bit is on */
5476 if (!estab_f&&!mime_decode_mode) {
5477 /* in case of not established yet */
5478 /* It is still ambiguous */
5479 if (h_conv(f, c2, c1)==EOF) {
5487 /* in case of already established */
5489 /* ignore bogus code */
5497 /* 2nd byte of 7 bit code or SJIS */
5501 else if (nkf_char_unicode_p(c1)) {
5507 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5510 }else if (input_codename && input_codename[0] == 'I' &&
5511 0xA1 <= c1 && c1 <= 0xDF) {
5512 /* JIS X 0201 Katakana in 8bit JIS */
5513 c2 = JIS_X_0201_1976_K;
5516 } else if (c1 > DEL) {
5518 if (!estab_f && !iso8859_f) {
5519 /* not established yet */
5521 } else { /* estab_f==TRUE */
5527 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5528 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5530 c2 = JIS_X_0201_1976_K;
5535 /* already established */
5539 } else if (SP < c1 && c1 < DEL) {
5540 /* in case of Roman characters */
5542 /* output 1 shifted byte */
5546 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5547 /* output 1 shifted byte */
5548 c2 = JIS_X_0201_1976_K;
5551 /* look like bogus code */
5554 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5555 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5556 /* in case of Kanji shifted */
5558 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5559 /* Check MIME code */
5560 if ((c1 = (*i_getc)(f)) == EOF) {
5563 } else if (c1 == '?') {
5564 /* =? is mime conversion start sequence */
5565 if(mime_f == STRICT_MIME) {
5566 /* check in real detail */
5567 if (mime_begin_strict(f) == EOF)
5570 } else if (mime_begin(f) == EOF)
5579 /* normal ASCII code */
5582 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5585 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5588 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5589 if ((c1 = (*i_getc)(f)) == EOF) {
5593 else if (c1 == '&') {
5595 if ((c1 = (*i_getc)(f)) == EOF) {
5601 else if (c1 == '$') {
5603 if ((c1 = (*i_getc)(f)) == EOF) {
5604 /* don't send bogus code
5606 (*oconv)(0, '$'); */
5608 } else if (c1 == '@' || c1 == 'B') {
5610 set_input_mode(JIS_X_0208);
5612 } else if (c1 == '(') {
5614 if ((c1 = (*i_getc)(f)) == EOF) {
5615 /* don't send bogus code
5621 } else if (c1 == '@'|| c1 == 'B') {
5623 set_input_mode(JIS_X_0208);
5626 } else if (c1 == 'D'){
5627 set_input_mode(JIS_X_0212);
5629 #endif /* X0212_ENABLE */
5630 } else if (c1 == 'O' || c1 == 'Q'){
5631 set_input_mode(JIS_X_0213_1);
5633 } else if (c1 == 'P'){
5634 set_input_mode(JIS_X_0213_2);
5637 /* could be some special code */
5644 } else if (broken_f&0x2) {
5645 /* accept any ESC-(-x as broken code ... */
5646 input_mode = JIS_X_0208;
5655 } else if (c1 == '(') {
5657 if ((c1 = (*i_getc)(f)) == EOF) {
5658 /* don't send bogus code
5660 (*oconv)(0, '('); */
5663 else if (c1 == 'I') {
5664 /* JIS X 0201 Katakana */
5665 set_input_mode(JIS_X_0201_1976_K);
5668 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5669 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5670 set_input_mode(ASCII);
5673 else if (broken_f&0x2) {
5674 set_input_mode(ASCII);
5683 else if (c1 == '.') {
5685 if ((c1 = (*i_getc)(f)) == EOF) {
5688 else if (c1 == 'A') {
5699 else if (c1 == 'N') {
5702 if (g2 == ISO_8859_1) {
5717 } else if (c1 == ESC && iconv == s_iconv) {
5718 /* ESC in Shift_JIS */
5719 if ((c1 = (*i_getc)(f)) == EOF) {
5722 } else if (c1 == '$') {
5724 if ((c1 = (*i_getc)(f)) == EOF) {
5726 } else if (('E' <= c1 && c1 <= 'G') ||
5727 ('O' <= c1 && c1 <= 'Q')) {
5735 static const nkf_char jphone_emoji_first_table[7] =
5736 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5737 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5738 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5739 while (SP <= c1 && c1 <= 'z') {
5740 (*oconv)(0, c1 + c3);
5741 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5756 } else if (c1 == LF || c1 == CR) {
5758 input_mode = ASCII; set_iconv(FALSE, 0);
5760 } else if (mime_decode_f && !mime_decode_mode){
5762 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5770 } else { /* if (c1 == CR)*/
5771 if ((c1=(*i_getc)(f))!=EOF) {
5775 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5795 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5798 if ((c3 = (*i_getc)(f)) != EOF) {
5801 if ((c4 = (*i_getc)(f)) != EOF) {
5803 (*iconv)(c2, c1, c3|c4);
5808 /* 3 bytes EUC or UTF-8 */
5809 if ((c3 = (*i_getc)(f)) != EOF) {
5811 (*iconv)(c2, c1, c3);
5819 0x7F <= c2 && c2 <= 0x92 &&
5820 0x21 <= c1 && c1 <= 0x7E) {
5822 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5825 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5829 (*oconv)(PREFIX_EUCG3 | c2, c1);
5831 #endif /* X0212_ENABLE */
5833 (*oconv)(PREFIX_EUCG3 | c2, c1);
5836 (*oconv)(input_mode, c1); /* other special case */
5842 /* goto next_word */
5847 (*iconv)(EOF, 0, 0);
5848 if (!input_codename)
5851 struct input_code *p = input_code_list;
5852 struct input_code *result = p;
5854 if (p->score < result->score) result = p;
5857 set_input_codename(result->name);
5859 debug(result->name);
5867 * int options(unsigned char *cp)
5874 options(unsigned char *cp)
5878 unsigned char *cp_back = NULL;
5883 while(*cp && *cp++!='-');
5884 while (*cp || cp_back) {
5892 case '-': /* literal options */
5893 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
5897 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5898 p = (unsigned char *)long_option[i].name;
5899 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5900 if (*p == cp[j] || cp[j] == SP){
5907 #if !defined(PERL_XS) && !defined(WIN32DLL)
5908 fprintf(stderr, "unknown long option: --%s\n", cp);
5912 while(*cp && *cp != SP && cp++);
5913 if (long_option[i].alias[0]){
5915 cp = (unsigned char *)long_option[i].alias;
5918 if (strcmp(long_option[i].name, "help") == 0){
5923 if (strcmp(long_option[i].name, "ic=") == 0){
5924 enc = nkf_enc_find((char *)p);
5926 input_encoding = enc;
5929 if (strcmp(long_option[i].name, "oc=") == 0){
5930 enc = nkf_enc_find((char *)p);
5931 /* if (enc <= 0) continue; */
5933 output_encoding = enc;
5936 if (strcmp(long_option[i].name, "guess=") == 0){
5937 if (p[0] == '0' || p[0] == '1') {
5945 if (strcmp(long_option[i].name, "overwrite") == 0){
5948 preserve_time_f = TRUE;
5951 if (strcmp(long_option[i].name, "overwrite=") == 0){
5954 preserve_time_f = TRUE;
5956 backup_suffix = (char *)p;
5959 if (strcmp(long_option[i].name, "in-place") == 0){
5962 preserve_time_f = FALSE;
5965 if (strcmp(long_option[i].name, "in-place=") == 0){
5968 preserve_time_f = FALSE;
5970 backup_suffix = (char *)p;
5975 if (strcmp(long_option[i].name, "cap-input") == 0){
5979 if (strcmp(long_option[i].name, "url-input") == 0){
5984 #ifdef NUMCHAR_OPTION
5985 if (strcmp(long_option[i].name, "numchar-input") == 0){
5991 if (strcmp(long_option[i].name, "no-output") == 0){
5995 if (strcmp(long_option[i].name, "debug") == 0){
6000 if (strcmp(long_option[i].name, "cp932") == 0){
6001 #ifdef SHIFTJIS_CP932
6005 #ifdef UTF8_OUTPUT_ENABLE
6006 ms_ucs_map_f = UCS_MAP_CP932;
6010 if (strcmp(long_option[i].name, "no-cp932") == 0){
6011 #ifdef SHIFTJIS_CP932
6015 #ifdef UTF8_OUTPUT_ENABLE
6016 ms_ucs_map_f = UCS_MAP_ASCII;
6020 #ifdef SHIFTJIS_CP932
6021 if (strcmp(long_option[i].name, "cp932inv") == 0){
6028 if (strcmp(long_option[i].name, "x0212") == 0){
6035 if (strcmp(long_option[i].name, "exec-in") == 0){
6039 if (strcmp(long_option[i].name, "exec-out") == 0){
6044 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
6045 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
6046 no_cp932ext_f = TRUE;
6049 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
6050 no_best_fit_chars_f = TRUE;
6053 if (strcmp(long_option[i].name, "fb-skip") == 0){
6054 encode_fallback = NULL;
6057 if (strcmp(long_option[i].name, "fb-html") == 0){
6058 encode_fallback = encode_fallback_html;
6061 if (strcmp(long_option[i].name, "fb-xml") == 0){
6062 encode_fallback = encode_fallback_xml;
6065 if (strcmp(long_option[i].name, "fb-java") == 0){
6066 encode_fallback = encode_fallback_java;
6069 if (strcmp(long_option[i].name, "fb-perl") == 0){
6070 encode_fallback = encode_fallback_perl;
6073 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6074 encode_fallback = encode_fallback_subchar;
6077 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6078 encode_fallback = encode_fallback_subchar;
6079 unicode_subchar = 0;
6081 /* decimal number */
6082 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6083 unicode_subchar *= 10;
6084 unicode_subchar += hex2bin(p[i]);
6086 }else if(p[1] == 'x' || p[1] == 'X'){
6087 /* hexadecimal number */
6088 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6089 unicode_subchar <<= 4;
6090 unicode_subchar |= hex2bin(p[i]);
6094 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6095 unicode_subchar *= 8;
6096 unicode_subchar += hex2bin(p[i]);
6099 w16e_conv(unicode_subchar, &i, &j);
6100 unicode_subchar = i<<8 | j;
6104 #ifdef UTF8_OUTPUT_ENABLE
6105 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6106 ms_ucs_map_f = UCS_MAP_MS;
6110 #ifdef UNICODE_NORMALIZATION
6111 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
6116 if (strcmp(long_option[i].name, "prefix=") == 0){
6117 if (nkf_isgraph(p[0])){
6118 for (i = 1; nkf_isgraph(p[i]); i++){
6119 prefix_table[p[i]] = p[0];
6124 #if !defined(PERL_XS) && !defined(WIN32DLL)
6125 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6130 case 'b': /* buffered mode */
6133 case 'u': /* non-buffered mode */
6136 case 't': /* transparent mode */
6141 } else if (*cp=='2') {
6145 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6153 case 'j': /* JIS output */
6155 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6157 case 'e': /* AT&T EUC output */
6158 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6160 case 's': /* SJIS output */
6161 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6163 case 'l': /* ISO8859 Latin-1 support, no conversion */
6164 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6165 input_encoding = nkf_enc_from_index(ISO_8859_1);
6167 case 'i': /* Kanji IN ESC-$-@/B */
6168 if (*cp=='@'||*cp=='B')
6169 kanji_intro = *cp++;
6171 case 'o': /* ASCII IN ESC-(-J/B/H */
6172 /* ESC ( H was used in initial JUNET messages */
6173 if (*cp=='J'||*cp=='B'||*cp=='H')
6174 ascii_intro = *cp++;
6178 bit:1 katakana->hiragana
6179 bit:2 hiragana->katakana
6181 if ('9'>= *cp && *cp>='0')
6182 hira_f |= (*cp++ -'0');
6189 #if defined(MSDOS) || defined(__OS2__)
6196 show_configuration();
6204 #ifdef UTF8_OUTPUT_ENABLE
6205 case 'w': /* UTF-8 output */
6210 output_encoding = nkf_enc_from_index(UTF_8N);
6212 output_bom_f = TRUE;
6213 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6217 if ('1'== cp[0] && '6'==cp[1]) {
6220 } else if ('3'== cp[0] && '2'==cp[1]) {
6224 output_encoding = nkf_enc_from_index(UTF_8);
6229 output_endian = ENDIAN_LITTLE;
6230 } else if (cp[0] == 'B') {
6235 enc_idx = enc_idx == UTF_16
6236 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6237 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6239 output_bom_f = TRUE;
6240 enc_idx = enc_idx == UTF_16
6241 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6242 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6244 output_encoding = nkf_enc_from_index(enc_idx);
6248 #ifdef UTF8_INPUT_ENABLE
6249 case 'W': /* UTF input */
6252 input_encoding = nkf_enc_from_index(UTF_8);
6255 if ('1'== cp[0] && '6'==cp[1]) {
6257 input_endian = ENDIAN_BIG;
6259 } else if ('3'== cp[0] && '2'==cp[1]) {
6261 input_endian = ENDIAN_BIG;
6264 input_encoding = nkf_enc_from_index(UTF_8);
6269 input_endian = ENDIAN_LITTLE;
6270 } else if (cp[0] == 'B') {
6272 input_endian = ENDIAN_BIG;
6274 enc_idx = (enc_idx == UTF_16
6275 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6276 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6277 input_encoding = nkf_enc_from_index(enc_idx);
6281 /* Input code assumption */
6282 case 'J': /* ISO-2022-JP input */
6283 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6285 case 'E': /* EUC-JP input */
6286 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6288 case 'S': /* Shift_JIS input */
6289 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6291 case 'Z': /* Convert X0208 alphabet to ASCII */
6293 bit:0 Convert JIS X 0208 Alphabet to ASCII
6294 bit:1 Convert Kankaku to one space
6295 bit:2 Convert Kankaku to two spaces
6296 bit:3 Convert HTML Entity
6297 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6299 while ('0'<= *cp && *cp <='4') {
6300 alpha_f |= 1 << (*cp++ - '0');
6304 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6305 x0201_f = FALSE; /* No X0201->X0208 conversion */
6307 ESC-(-I in JIS, EUC, MS Kanji
6308 SI/SO in JIS, EUC, MS Kanji
6309 SS2 in EUC, JIS, not in MS Kanji
6310 MS Kanji (0xa0-0xdf)
6312 ESC-(-I in JIS (0x20-0x5f)
6313 SS2 in EUC (0xa0-0xdf)
6314 0xa0-0xdf in MS Kanji (0xa0-0xdf)
6317 case 'X': /* Convert X0201 kana to X0208 */
6320 case 'F': /* preserve new lines */
6321 fold_preserve_f = TRUE;
6322 case 'f': /* folding -f60 or -f */
6325 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6327 fold_len += *cp++ - '0';
6329 if (!(0<fold_len && fold_len<BUFSIZ))
6330 fold_len = DEFAULT_FOLD;
6334 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6336 fold_margin += *cp++ - '0';
6340 case 'm': /* MIME support */
6341 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6342 if (*cp=='B'||*cp=='Q') {
6343 mime_decode_mode = *cp++;
6344 mimebuf_f = FIXED_MIME;
6345 } else if (*cp=='N') {
6346 mime_f = TRUE; cp++;
6347 } else if (*cp=='S') {
6348 mime_f = STRICT_MIME; cp++;
6349 } else if (*cp=='0') {
6350 mime_decode_f = FALSE;
6351 mime_f = FALSE; cp++;
6353 mime_f = STRICT_MIME;
6356 case 'M': /* MIME output */
6359 mimeout_f = FIXED_MIME; cp++;
6360 } else if (*cp=='Q') {
6362 mimeout_f = FIXED_MIME; cp++;
6367 case 'B': /* Broken JIS support */
6369 bit:1 allow any x on ESC-(-x or ESC-$-x
6370 bit:2 reset to ascii on NL
6372 if ('9'>= *cp && *cp>='0')
6373 broken_f |= 1<<(*cp++ -'0');
6378 case 'O':/* for Output file */
6382 case 'c':/* add cr code */
6385 case 'd':/* delete cr code */
6388 case 'I': /* ISO-2022-JP output */
6391 case 'L': /* line mode */
6392 if (*cp=='u') { /* unix */
6393 eolmode_f = LF; cp++;
6394 } else if (*cp=='m') { /* mac */
6395 eolmode_f = CR; cp++;
6396 } else if (*cp=='w') { /* windows */
6397 eolmode_f = CRLF; cp++;
6398 } else if (*cp=='0') { /* no conversion */
6399 eolmode_f = 0; cp++;
6404 if ('2' <= *cp && *cp <= '9') {
6407 } else if (*cp == '0' || *cp == '1') {
6416 /* multiple options in a single string are allowed for the Perl module */
6417 while(*cp && *cp++!='-');
6420 #if !defined(PERL_XS) && !defined(WIN32DLL)
6421 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6423 /* bogus option but ignored */
6431 #include "nkf32dll.c"
6432 #elif defined(PERL_XS)
6433 #else /* WIN32DLL */
6435 main(int argc, char **argv)
6440 char *outfname = NULL;
6443 #ifdef EASYWIN /*Easy Win */
6444 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6446 #ifdef DEFAULT_CODE_LOCALE
6447 setlocale(LC_CTYPE, "");
6451 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6452 cp = (unsigned char *)*argv;
6457 if (pipe(fds) < 0 || (pid = fork()) < 0){
6468 execvp(argv[1], &argv[1]);
6485 int debug_f_back = debug_f;
6488 int exec_f_back = exec_f;
6491 int x0212_f_back = x0212_f;
6493 int x0213_f_back = x0213_f;
6494 int guess_f_back = guess_f;
6496 guess_f = guess_f_back;
6499 debug_f = debug_f_back;
6502 exec_f = exec_f_back;
6504 x0212_f = x0212_f_back;
6505 x0213_f = x0213_f_back;
6508 if (binmode_f == TRUE)
6509 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6510 if (freopen("","wb",stdout) == NULL)
6517 setbuf(stdout, (char *) NULL);
6519 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6522 if (binmode_f == TRUE)
6523 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6524 if (freopen("","rb",stdin) == NULL) return (-1);
6528 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
6532 kanji_convert(stdin);
6533 if (guess_f) print_guessed_code(NULL);
6537 int is_argument_error = FALSE;
6539 input_codename = NULL;
6542 iconv_for_check = 0;
6544 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6546 is_argument_error = TRUE;
6554 /* reopen file for stdout */
6555 if (file_out_f == TRUE) {
6558 outfname = nkf_xmalloc(strlen(origfname)
6559 + strlen(".nkftmpXXXXXX")
6561 strcpy(outfname, origfname);
6565 for (i = strlen(outfname); i; --i){
6566 if (outfname[i - 1] == '/'
6567 || outfname[i - 1] == '\\'){
6573 strcat(outfname, "ntXXXXXX");
6575 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6576 S_IREAD | S_IWRITE);
6578 strcat(outfname, ".nkftmpXXXXXX");
6579 fd = mkstemp(outfname);
6582 || (fd_backup = dup(fileno(stdout))) < 0
6583 || dup2(fd, fileno(stdout)) < 0
6594 outfname = "nkf.out";
6597 if(freopen(outfname, "w", stdout) == NULL) {
6601 if (binmode_f == TRUE) {
6602 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6603 if (freopen("","wb",stdout) == NULL)
6610 if (binmode_f == TRUE)
6611 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6612 if (freopen("","rb",fin) == NULL)
6617 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6621 char *filename = NULL;
6623 if (nfiles > 1) filename = origfname;
6624 if (guess_f) print_guessed_code(filename);
6630 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6638 if (dup2(fd_backup, fileno(stdout)) < 0){
6641 if (stat(origfname, &sb)) {
6642 fprintf(stderr, "Can't stat %s\n", origfname);
6644 /* restore the original file's permissions */
6645 if (chmod(outfname, sb.st_mode)) {
6646 fprintf(stderr, "Can't set permission %s\n", outfname);
6649 /* restore the original file's timestamp */
6650 if(preserve_time_f){
6651 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6652 tb[0] = tb[1] = sb.st_mtime;
6653 if (utime(outfname, tb)) {
6654 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6657 tb.actime = sb.st_atime;
6658 tb.modtime = sb.st_mtime;
6659 if (utime(outfname, &tb)) {
6660 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6665 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6667 unlink(backup_filename);
6669 if (rename(origfname, backup_filename)) {
6670 perror(backup_filename);
6671 fprintf(stderr, "Can't rename %s to %s\n",
6672 origfname, backup_filename);
6674 nkf_xfree(backup_filename);
6677 if (unlink(origfname)){
6682 if (rename(outfname, origfname)) {
6684 fprintf(stderr, "Can't rename %s to %s\n",
6685 outfname, origfname);
6687 nkf_xfree(outfname);
6692 if (is_argument_error)
6695 #ifdef EASYWIN /*Easy Win */
6696 if (file_out_f == FALSE)
6697 scanf("%d",&end_check);
6700 #else /* for Other OS */
6701 if (file_out_f == TRUE)
6703 #endif /*Easy Win */
6706 #endif /* WIN32DLL */