2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2009, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.1.1"
24 #define NKF_RELEASE_DATE "2010-01-05"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2010, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
/*
 * Forward declarations of the per-encoding converters referenced by the
 * NkfEncoding* descriptors.
 *   *_iconv: take three nkf_char arguments (c2, c1, c0), return an nkf_char.
 *   *_oconv: take two nkf_char arguments (c2, c1), return nothing.
 */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
/*
 * Base encoding descriptors: a display name followed by the input converter
 * (iconv) and the output converter (oconv) for that family.
 * ASCII, ISO-2022-JP and EUC-JP all use e_iconv for input; ISO-2022-JP
 * differs from the other two only in its output converter (j_oconv).
 * nkf_encoding_table entries point at these through base_encoding.
 */
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
215 {"ISO-2022-JP", ISO_2022_JP},
216 {"ISO2022JP-CP932", CP50220},
217 {"CP50220", CP50220},
218 {"CP50221", CP50221},
219 {"CSISO2022JP", CP50221},
220 {"CP50222", CP50222},
221 {"ISO-2022-JP-1", ISO_2022_JP_1},
222 {"ISO-2022-JP-3", ISO_2022_JP_3},
223 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
224 {"SHIFT_JIS", SHIFT_JIS},
227 {"WINDOWS-31J", WINDOWS_31J},
228 {"CSWINDOWS31J", WINDOWS_31J},
229 {"CP932", WINDOWS_31J},
230 {"MS932", WINDOWS_31J},
231 {"CP10001", CP10001},
234 {"EUCJP-NKF", EUCJP_NKF},
235 {"CP51932", CP51932},
236 {"EUC-JP-MS", EUCJP_MS},
237 {"EUCJP-MS", EUCJP_MS},
238 {"EUCJPMS", EUCJP_MS},
239 {"EUC-JP-ASCII", EUCJP_ASCII},
240 {"EUCJP-ASCII", EUCJP_ASCII},
241 {"SHIFT_JISX0213", SHIFT_JISX0213},
242 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
243 {"EUC-JISX0213", EUC_JISX0213},
244 {"EUC-JIS-2004", EUC_JIS_2004},
247 {"UTF-8-BOM", UTF_8_BOM},
248 {"UTF8-MAC", UTF8_MAC},
249 {"UTF-8-MAC", UTF8_MAC},
251 {"UTF-16BE", UTF_16BE},
252 {"UTF-16BE-BOM", UTF_16BE_BOM},
253 {"UTF-16LE", UTF_16LE},
254 {"UTF-16LE-BOM", UTF_16LE_BOM},
256 {"UTF-32BE", UTF_32BE},
257 {"UTF-32BE-BOM", UTF_32BE_BOM},
258 {"UTF-32LE", UTF_32LE},
259 {"UTF-32LE-BOM", UTF_32LE_BOM},
264 #if defined(DEFAULT_CODE_JIS)
265 #define DEFAULT_ENCIDX ISO_2022_JP
266 #elif defined(DEFAULT_CODE_SJIS)
267 #define DEFAULT_ENCIDX SHIFT_JIS
268 #elif defined(DEFAULT_CODE_WINDOWS_31J)
269 #define DEFAULT_ENCIDX WINDOWS_31J
270 #elif defined(DEFAULT_CODE_EUC)
271 #define DEFAULT_ENCIDX EUC_JP
272 #elif defined(DEFAULT_CODE_UTF8)
273 #define DEFAULT_ENCIDX UTF_8
/*
 * True when c is an ASCII letter ('a'-'z', 'A'-'Z') or digit ('0'-'9').
 *
 * The argument is parenthesized at every use so that expression arguments
 * (for example `a | b`) are classified as a whole.  c is still evaluated
 * several times, so it must not have side effects.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))
/* I don't trust portability of toupper */
/*
 * ASCII-only character classification helpers.
 *
 * Every use of the argument is parenthesized so that expression arguments
 * (for example `a | b` or `x ? y : z`) are treated as a single value.
 * The argument is still evaluated more than once, so it must be free of
 * side effects.
 */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c)  ((c) == SP || (c) == TAB)
#define nkf_isspace(c)  (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c)  (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c)  (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c)  (SP<=(c) && (c)<='~')
#define nkf_isgraph(c)  ('!'<=(c) && (c)<='~')
/*
 * hex2bin(c): numeric value (0-15) of the hexadecimal digit c, accepting
 *             both letter cases; yields 0 for any non-hex character.
 * bin2hex(c): upper-case hexadecimal digit for the low four bits of c.
 *
 * The argument is parenthesized so that expression arguments are masked or
 * compared as a whole; without that, bin2hex(a | b) would apply `& 15` to b
 * only and could index past the end of the digit string.
 * The argument must not have side effects (hex2bin evaluates it repeatedly).
 */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/*
 * Byte/code predicates.  Arguments are parenthesized at every use so that
 * expression arguments are evaluated as a whole; they are evaluated more
 * than once and must not have side effects.
 */

/* True when the second-lowest byte of c2 (taken as unsigned short) is SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)

/*
 * True when c may appear unescaped: CR, LF, or a character strictly between
 * SP and DEL other than  ? = _ ( ) .  and the double quote (0x22).
 */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

/* True when c2 lies in [CP932_TABLE_BEGIN, CP932_TABLE_END]. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
/* True when c lies in [SP, 0x5F]. */
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) <= 0x5F)
303 #define HOLD_SIZE 1024
304 #if defined(INT_IS_SHORT)
305 #define IOBUF_SIZE 2048
307 #define IOBUF_SIZE 16384
310 #define DEFAULT_J 'B'
311 #define DEFAULT_R 'B'
318 /* MIME preprocessor */
320 #ifdef EASYWIN /*Easy Win */
321 extern POINT _BufferSize;
330 void (*status_func)(struct input_code *, nkf_char);
331 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
335 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
336 static nkf_encoding *input_encoding = NULL;
337 static nkf_encoding *output_encoding = NULL;
339 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
341 * 0: Shift_JIS, eucJP-ascii
346 #define UCS_MAP_ASCII 0
348 #define UCS_MAP_CP932 2
349 #define UCS_MAP_CP10001 3
350 static int ms_ucs_map_f = UCS_MAP_ASCII;
352 #ifdef UTF8_INPUT_ENABLE
353 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
354 static int no_cp932ext_f = FALSE;
355 /* ignore ZERO WIDTH NO-BREAK SPACE */
356 static int no_best_fit_chars_f = FALSE;
357 static int input_endian = ENDIAN_BIG;
358 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
359 static void (*encode_fallback)(nkf_char c) = NULL;
360 static void w_status(struct input_code *, nkf_char);
362 #ifdef UTF8_OUTPUT_ENABLE
363 static int output_bom_f = FALSE;
364 static int output_endian = ENDIAN_BIG;
367 static void std_putc(nkf_char c);
368 static nkf_char std_getc(FILE *f);
369 static nkf_char std_ungetc(nkf_char c,FILE *f);
371 static nkf_char broken_getc(FILE *f);
372 static nkf_char broken_ungetc(nkf_char c,FILE *f);
374 static nkf_char mime_getc(FILE *f);
376 static void mime_putc(nkf_char c);
380 #if !defined(PERL_XS) && !defined(WIN32DLL)
381 static unsigned char stdibuf[IOBUF_SIZE];
382 static unsigned char stdobuf[IOBUF_SIZE];
/* Option flags, initialised here to their defaults. */
386 static int unbuf_f = FALSE;
387 static int estab_f = FALSE;
388 static int nop_f = FALSE;
389 static int binmode_f = TRUE; /* binary mode */
390 static int rot_f = FALSE; /* rot14/43 mode -- NOTE(review): presumably ROT13/47; confirm */
391 static int hira_f = FALSE; /* hiragana/katakana conversion */
392 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
393 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
394 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
395 static int mimebuf_f = FALSE; /* MIME buffered input */
396 static int broken_f = FALSE; /* convert ESC-less broken JIS */
397 static int iso8859_f = FALSE; /* ISO8859 through */
398 static int mimeout_f = FALSE; /* base64 mode */
399 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
400 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
402 #ifdef UNICODE_NORMALIZATION
403 static int nfc_f = FALSE;
404 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
405 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
409 static int cap_f = FALSE;
410 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
411 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
413 static int url_f = FALSE;
414 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
415 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * nkf_char tagging helpers.
 *   - PREFIX_EUCG3 is OR-ed into a code by nkf_char_euc3_new().
 *   - CLASS_UNICODE is OR-ed into a code by nkf_char_unicode_new(); a value
 *     whose CLASS_MASK bits equal CLASS_UNICODE is recognised by
 *     nkf_char_unicode_p(), and its VALUE_MASK bits are range-checked by the
 *     *_bmp_p / *_value_p predicates.
 * Macro arguments are parenthesized so that expression arguments are masked
 * as a whole rather than only their last operand.
 */
#define PREFIX_EUCG3    NKF_INT32_C(0x8F00)
#define CLASS_MASK      NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE   NKF_INT32_C(0x01000000)
#define VALUE_MASK      NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX     NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
430 #ifdef NUMCHAR_OPTION
431 static int numchar_f = FALSE;
432 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
433 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
437 static int noout_f = FALSE;
438 static void no_putc(nkf_char c);
439 static int debug_f = FALSE;
440 static void debug(const char *str);
441 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
444 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
445 static void set_input_codename(const char *codename);
448 static int exec_f = 0;
451 #ifdef SHIFTJIS_CP932
452 /* invert IBM extended characters to others */
453 static int cp51932_f = FALSE;
455 /* invert NEC-selected IBM extended characters to IBM extended characters */
456 static int cp932inv_f = TRUE;
458 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
459 #endif /* SHIFTJIS_CP932 */
461 static int x0212_f = FALSE;
462 static int x0213_f = FALSE;
464 static unsigned char prefix_table[256];
466 static void e_status(struct input_code *, nkf_char);
467 static void s_status(struct input_code *, nkf_char);
469 struct input_code input_code_list[] = {
470 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
471 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
472 #ifdef UTF8_INPUT_ENABLE
473 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
474 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
475 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
480 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
481 static int base64_count = 0;
483 /* X0208 -> ASCII converter */
486 static int f_line = 0; /* chars in line */
487 static int f_prev = 0;
488 static int fold_preserve_f = FALSE; /* preserve new lines */
489 static int fold_f = FALSE;
490 static int fold_len = 0;
493 static unsigned char kanji_intro = DEFAULT_J;
494 static unsigned char ascii_intro = DEFAULT_R;
498 #define FOLD_MARGIN 10
499 #define DEFAULT_FOLD 60
501 static int fold_margin = FOLD_MARGIN;
503 /* process default */
506 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
508 fprintf(stderr,"nkf internal module connection failure.\n");
514 no_connection(nkf_char c2, nkf_char c1)
516 no_connection2(c2,c1,0);
519 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
520 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
522 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
523 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
524 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
525 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
526 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
527 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
528 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
530 /* static redirections */
532 static void (*o_putc)(nkf_char c) = std_putc;
534 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
535 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
537 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
538 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
540 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
542 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
543 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
545 /* for strict mime */
546 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
547 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
550 static int output_mode = ASCII; /* output kanji mode */
551 static int input_mode = ASCII; /* input kanji mode */
552 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
554 /* X0201 / X0208 conversion tables */
556 /* X0201 kana conversion table */
558 static const unsigned char cv[]= {
559 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
560 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
561 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
562 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
563 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
564 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
565 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
566 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
567 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
568 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
569 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
570 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
571 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
572 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
573 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
574 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
578 /* X0201 kana conversion table for dakuten */
580 static const unsigned char dv[]= {
581 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
582 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
583 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
584 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
585 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
586 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
587 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
588 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
589 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
590 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
592 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
593 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
594 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
595 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
596 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 /* X0201 kana conversion table for handakuten */
601 static const unsigned char ev[]= {
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
613 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
621 /* X0208 kigou conversion table */
622 /* 0x8140 - 0x819e */
623 static const unsigned char fv[] = {
625 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
626 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
627 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
628 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
629 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
630 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
631 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
632 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
633 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
634 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
635 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
636 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
641 static int option_mode = 0;
642 static int file_out_f = FALSE;
644 static int overwrite_f = FALSE;
645 static int preserve_time_f = FALSE;
646 static int backup_f = FALSE;
647 static char *backup_suffix = "";
650 static int eolmode_f = 0; /* CR, LF, CRLF */
651 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
652 static nkf_char prev_cr = 0; /* CR or 0 */
653 #ifdef EASYWIN /*Easy Win */
654 static int end_check;
658 nkf_xmalloc(size_t size)
662 if (size == 0) size = 1;
666 perror("can't malloc");
674 nkf_xrealloc(void *ptr, size_t size)
676 if (size == 0) size = 1;
678 ptr = realloc(ptr, size);
680 perror("can't realloc");
687 #define nkf_xfree(ptr) free(ptr)
/*
 * Compare src and target ignoring ASCII letter case (via nkf_toupper).
 * Returns FALSE at the first differing character, and also when one string
 * ends before the other (i.e. one is a proper prefix of the other).
 */
690 nkf_str_caseeql(const char *src, const char *target)
693 for (i = 0; src[i] && target[i]; i++) {
694 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
/* the loop stopped at a NUL; equal only if both strings ended together */
696 if (src[i] || target[i]) return FALSE;
701 nkf_enc_from_index(int idx)
703 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
706 return &nkf_encoding_table[idx];
/*
 * Look up the encoding id registered for name in encoding_name_to_id_table.
 * A leading "X-" (upper-case 'X' only) is skipped first; the remaining name
 * is matched case-insensitively with nkf_str_caseeql.  The scan stops at the
 * first table entry whose id is negative.
 */
710 nkf_enc_find_index(const char *name)
/* drop an "X-" prefix before matching */
713 if (name[0] == 'X' && *(name+1) == '-') name += 2;
714 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
715 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
716 return encoding_name_to_id_table[i].id;
723 nkf_enc_find(const char *name)
726 idx = nkf_enc_find_index(name);
727 if (idx < 0) return 0;
728 return nkf_enc_from_index(idx);
/*
 * Accessors for an encoding descriptor pointer `enc`: its name, its id, its
 * base encoding, and the iconv/oconv function pointers of that base encoding.
 */
731 #define nkf_enc_name(enc) (enc)->name
732 #define nkf_enc_to_index(enc) (enc)->id
733 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
734 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
735 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is NkfEncodingASCII or NkfEncodingISO_2022_JP. */
736 #define nkf_enc_asciicompat(enc) (\
737 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
738 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is one of the UTF-8 / UTF-16 / UTF-32 families. */
739 #define nkf_enc_unicode_p(enc) (\
740 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
741 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
742 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the encoding id is CP50220, CP50221 or CP50222. */
743 #define nkf_enc_cp5022x_p(enc) (\
744 nkf_enc_to_index(enc) == CP50220 ||\
745 nkf_enc_to_index(enc) == CP50221 ||\
746 nkf_enc_to_index(enc) == CP50222)
748 #ifdef DEFAULT_CODE_LOCALE
752 #ifdef HAVE_LANGINFO_H
753 return nl_langinfo(CODESET);
754 #elif defined(__WIN32__)
756 sprintf(buf, "CP%d", GetACP());
758 #elif defined(__OS2__)
759 # if defined(INT_IS_SHORT)
765 ULONG ulCP[1], ulncp;
766 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
767 if (ulCP[0] == 932 || ulCP[0] == 943)
768 strcpy(buf, "Shift_JIS");
770 sprintf(buf, "CP%lu", ulCP[0]);
778 nkf_locale_encoding()
780 nkf_encoding *enc = 0;
781 const char *encname = nkf_locale_charmap();
783 enc = nkf_enc_find(encname);
786 #endif /* DEFAULT_CODE_LOCALE */
791 return &nkf_encoding_table[UTF_8];
795 nkf_default_encoding()
797 nkf_encoding *enc = 0;
798 #ifdef DEFAULT_CODE_LOCALE
799 enc = nkf_locale_encoding();
800 #elif defined(DEFAULT_ENCIDX)
801 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
803 if (!enc) enc = nkf_utf8_encoding();
814 nkf_buf_new(int length)
816 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
817 buf->ptr = nkf_xmalloc(length);
825 nkf_buf_dispose(nkf_buf_t *buf)
832 #define nkf_buf_length(buf) ((buf)->len)
833 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
/*
 * Return the element stored at position index of buf.
 *
 * NOTE(review): the assertion accepts index == buf->len, which is one past
 * the last element stored by nkf_buf_push (it writes ptr[len++]), and it does
 * not reject negative indexes.  `0 <= index && index < buf->len` looks like
 * the intended bound -- confirm against the callers before tightening.
 */
836 nkf_buf_at(nkf_buf_t *buf, int index)
838 assert(index <= buf->len);
839 return buf->ptr[index];
843 nkf_buf_clear(nkf_buf_t *buf)
849 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
851 if (buf->capa <= buf->len) {
854 buf->ptr[buf->len++] = c;
858 nkf_buf_pop(nkf_buf_t *buf)
860 assert(!nkf_buf_empty_p(buf));
861 return buf->ptr[--buf->len];
864 /* Normalization Form C */
867 #define fprintf dllprintf
873 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
880 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
881 #ifdef UTF8_OUTPUT_ENABLE
882 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
883 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
886 #ifdef UTF8_INPUT_ENABLE
887 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
888 " UTF option is -W[8,[16,32][B,L]]\n"
/* upper-case J/S/E select the input encoding (lower-case j/s/e select output) */
890 " J/S/E Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
894 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
895 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
896 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
899 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
900 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
901 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
902 " X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
905 " O Output to File (DEFAULT 'nkf.out')\n"
906 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
909 " --ic=<encoding> Specify the input encoding\n"
910 " --oc=<encoding> Specify the output encoding\n"
911 " --hiragana --katakana Hiragana/Katakana Conversion\n"
912 " --katakana-hiragana Converts each other\n"
916 " --{cap, url}-input Convert hex after ':' or '%%'\n"
918 #ifdef NUMCHAR_OPTION
919 " --numchar-input Convert Unicode Character Reference\n"
921 #ifdef UTF8_INPUT_ENABLE
922 " --fb-{skip, html, xml, perl, java, subchar}\n"
923 " Specify unassigned character's replacement\n"
928 " --in-place[=SUF] Overwrite original files\n"
929 " --overwrite[=SUF] Preserve timestamp of original files\n"
931 " -g --guess Guess the input code\n"
932 " -v --version Print the version\n"
933 " --help/-V Print this help / configuration\n"
939 show_configuration(void)
942 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
943 " Compile-time options:\n"
944 " Compiled at: " __DATE__ " " __TIME__ "\n"
947 " Default output encoding: "
948 #ifdef DEFAULT_CODE_LOCALE
949 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
950 #elif defined(DEFAULT_ENCIDX)
951 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
957 " Default output end of line: "
958 #if DEFAULT_NEWLINE == CR
960 #elif DEFAULT_NEWLINE == CRLF
966 " Decode MIME encoded string: "
967 #if MIME_DECODE_DEFAULT
973 " Convert JIS X 0201 Katakana: "
980 " --help, --version output: "
981 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build the backup file name for filename from suffix and return it in a
 * buffer obtained from nkf_xmalloc.
 *
 * Two construction paths are visible:
 *   - template path: every '*' in suffix is replaced by filename and all
 *     other characters of suffix are copied through;
 *   - append path: the result is filename immediately followed by suffix.
 * NOTE(review): presumably the template path is taken only when suffix
 * contains at least one '*'; the selecting condition is not visible here.
 */
992 get_backup_filename(const char *suffix, const char *filename)
994 char *backup_filename;
995 int asterisk_count = 0;
997 int filename_length = strlen(filename);
/* count the '*' placeholders in suffix */
999 for(i = 0; suffix[i]; i++){
1000 if(suffix[i] == '*') asterisk_count++;
/*
 * Each one-character '*' grows to filename_length characters, i.e. a net
 * growth of (filename_length - 1) per placeholder; +1 for the terminator.
 */
1004 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
1005 for(i = 0, j = 0; suffix[i];){
1006 if(suffix[i] == '*'){
/* terminate at j so that strncat appends filename exactly there */
1007 backup_filename[j] = '\0';
1008 strncat(backup_filename, filename, filename_length);
1010 j += filename_length;
1012 backup_filename[j++] = suffix[i++];
1015 backup_filename[j] = '\0';
/* append path: filename followed by suffix */
1017 j = filename_length + strlen(suffix);
1018 backup_filename = nkf_xmalloc(j + 1);
1019 strcpy(backup_filename, filename);
1020 strcat(backup_filename, suffix);
1021 backup_filename[j] = '\0';
1023 return backup_filename;
1027 #ifdef UTF8_INPUT_ENABLE
1029 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1034 if(c >= NKF_INT32_C(1)<<shift){
1036 (*f)(0, bin2hex(c>>shift));
1047 encode_fallback_html(nkf_char c)
1052 if(c >= NKF_INT32_C(1000000))
1053 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1054 if(c >= NKF_INT32_C(100000))
1055 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1057 (*oconv)(0, 0x30+(c/10000 )%10);
1059 (*oconv)(0, 0x30+(c/1000 )%10);
1061 (*oconv)(0, 0x30+(c/100 )%10);
1063 (*oconv)(0, 0x30+(c/10 )%10);
1065 (*oconv)(0, 0x30+ c %10);
1071 encode_fallback_xml(nkf_char c)
1076 nkf_each_char_to_hex(oconv, c);
1082 encode_fallback_java(nkf_char c)
1086 if(!nkf_char_unicode_bmp_p(c)){
1090 (*oconv)(0, bin2hex(c>>20));
1091 (*oconv)(0, bin2hex(c>>16));
1095 (*oconv)(0, bin2hex(c>>12));
1096 (*oconv)(0, bin2hex(c>> 8));
1097 (*oconv)(0, bin2hex(c>> 4));
1098 (*oconv)(0, bin2hex(c ));
1103 encode_fallback_perl(nkf_char c)
1108 nkf_each_char_to_hex(oconv, c);
1114 encode_fallback_subchar(nkf_char c)
1116 c = unicode_subchar;
1117 (*oconv)((c>>8)&0xFF, c&0xFF);
1122 static const struct {
1146 {"katakana-hiragana","h3"},
1154 #ifdef UTF8_OUTPUT_ENABLE
1164 {"fb-subchar=", ""},
1166 #ifdef UTF8_INPUT_ENABLE
1167 {"utf8-input", "W"},
1168 {"utf16-input", "W16"},
1169 {"no-cp932ext", ""},
1170 {"no-best-fit-chars",""},
1172 #ifdef UNICODE_NORMALIZATION
1173 {"utf8mac-input", ""},
1185 #ifdef NUMCHAR_OPTION
1186 {"numchar-input", ""},
1192 #ifdef SHIFTJIS_CP932
1203 set_input_encoding(nkf_encoding *enc)
1205 switch (nkf_enc_to_index(enc)) {
1213 #ifdef SHIFTJIS_CP932
1216 #ifdef UTF8_OUTPUT_ENABLE
1217 ms_ucs_map_f = UCS_MAP_CP932;
1227 case ISO_2022_JP_2004:
1235 #ifdef SHIFTJIS_CP932
1238 #ifdef UTF8_OUTPUT_ENABLE
1239 ms_ucs_map_f = UCS_MAP_CP932;
1244 #ifdef SHIFTJIS_CP932
1247 #ifdef UTF8_OUTPUT_ENABLE
1248 ms_ucs_map_f = UCS_MAP_CP10001;
1257 #ifdef SHIFTJIS_CP932
1260 #ifdef UTF8_OUTPUT_ENABLE
1261 ms_ucs_map_f = UCS_MAP_CP932;
1265 #ifdef SHIFTJIS_CP932
1268 #ifdef UTF8_OUTPUT_ENABLE
1269 ms_ucs_map_f = UCS_MAP_MS;
1273 #ifdef SHIFTJIS_CP932
1276 #ifdef UTF8_OUTPUT_ENABLE
1277 ms_ucs_map_f = UCS_MAP_ASCII;
1280 case SHIFT_JISX0213:
1281 case SHIFT_JIS_2004:
1283 #ifdef SHIFTJIS_CP932
1290 #ifdef SHIFTJIS_CP932
1294 #ifdef UTF8_INPUT_ENABLE
1295 #ifdef UNICODE_NORMALIZATION
1303 input_endian = ENDIAN_BIG;
1307 input_endian = ENDIAN_LITTLE;
1312 input_endian = ENDIAN_BIG;
1316 input_endian = ENDIAN_LITTLE;
1323 set_output_encoding(nkf_encoding *enc)
1325 switch (nkf_enc_to_index(enc)) {
1328 #ifdef SHIFTJIS_CP932
1329 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1331 #ifdef UTF8_OUTPUT_ENABLE
1332 ms_ucs_map_f = UCS_MAP_CP932;
1337 #ifdef SHIFTJIS_CP932
1338 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1340 #ifdef UTF8_OUTPUT_ENABLE
1341 ms_ucs_map_f = UCS_MAP_CP932;
1345 #ifdef SHIFTJIS_CP932
1346 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1351 #ifdef SHIFTJIS_CP932
1352 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1358 #ifdef SHIFTJIS_CP932
1359 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1366 #ifdef UTF8_OUTPUT_ENABLE
1367 ms_ucs_map_f = UCS_MAP_CP932;
1371 #ifdef UTF8_OUTPUT_ENABLE
1372 ms_ucs_map_f = UCS_MAP_CP10001;
1377 #ifdef SHIFTJIS_CP932
1378 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1380 #ifdef UTF8_OUTPUT_ENABLE
1381 ms_ucs_map_f = UCS_MAP_ASCII;
1386 #ifdef SHIFTJIS_CP932
1387 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1389 #ifdef UTF8_OUTPUT_ENABLE
1390 ms_ucs_map_f = UCS_MAP_ASCII;
1395 #ifdef SHIFTJIS_CP932
1396 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1398 #ifdef UTF8_OUTPUT_ENABLE
1399 ms_ucs_map_f = UCS_MAP_CP932;
1404 #ifdef UTF8_OUTPUT_ENABLE
1405 ms_ucs_map_f = UCS_MAP_MS;
1410 #ifdef UTF8_OUTPUT_ENABLE
1411 ms_ucs_map_f = UCS_MAP_ASCII;
1414 case SHIFT_JISX0213:
1415 case SHIFT_JIS_2004:
1417 #ifdef SHIFTJIS_CP932
1418 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1425 #ifdef SHIFTJIS_CP932
1426 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1429 #ifdef UTF8_OUTPUT_ENABLE
1431 output_bom_f = TRUE;
1435 output_bom_f = TRUE;
1438 output_endian = ENDIAN_LITTLE;
1439 output_bom_f = FALSE;
1442 output_endian = ENDIAN_LITTLE;
1443 output_bom_f = TRUE;
1447 output_bom_f = TRUE;
1450 output_endian = ENDIAN_LITTLE;
1451 output_bom_f = FALSE;
1454 output_endian = ENDIAN_LITTLE;
1455 output_bom_f = TRUE;
1461 static struct input_code*
1462 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1465 struct input_code *p = input_code_list;
1467 if (iconv_func == p->iconv_func){
1477 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1479 #ifdef INPUT_CODE_FIX
1480 if (f || !input_encoding)
1487 #ifdef INPUT_CODE_FIX
1488 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1494 if (estab_f && iconv_for_check != iconv){
1495 struct input_code *p = find_inputcode_byfunc(iconv);
1497 set_input_codename(p->name);
1500 iconv_for_check = iconv;
1507 x0212_shift(nkf_char c)
1512 if (0x75 <= c && c <= 0x7f){
1513 ret = c + (0x109 - 0x75);
1516 if (0x75 <= c && c <= 0x7f){
1517 ret = c + (0x113 - 0x75);
1525 x0212_unshift(nkf_char c)
1528 if (0x7f <= c && c <= 0x88){
1529 ret = c + (0x75 - 0x7f);
1530 }else if (0x89 <= c && c <= 0x92){
1531 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1535 #endif /* X0212_ENABLE */
1538 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1544 if((0x21 <= ndx && ndx <= 0x2F)){
1545 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1546 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1548 }else if(0x6E <= ndx && ndx <= 0x7E){
1549 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1550 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1556 else if(nkf_isgraph(ndx)){
1558 const unsigned short *ptr;
1559 ptr = x0212_shiftjis[ndx - 0x21];
1561 val = ptr[(c1 & 0x7f) - 0x21];
1570 c2 = x0212_shift(c2);
1572 #endif /* X0212_ENABLE */
1574 if(0x7F < c2) return 1;
1575 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1576 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * Convert the byte pair (c2, c1) and store the result through p2 and p1.
 * Returns 1 when c1 is above 0xFC; other return paths are not visible in
 * this excerpt.
 * NOTE(review): the name suggests Shift_JIS -> EUC-JP, with c2 the lead
 * byte and c1 the trail byte.
 */
1581 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1583 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
/* indexed by [c2 - 0xF0][0x9E < c1] in the x0213_f branch below */
1586 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1587 if (0xFC < c1) return 1;
1588 #ifdef SHIFTJIS_CP932
/* table lookups are indexed by lead-byte offset and (c1 - 0x40) */
1589 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1590 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1597 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1598 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1604 #endif /* SHIFTJIS_CP932 */
1606 if (!x0213_f && is_ibmext_in_sjis(c2)){
1607 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
/* tag the high byte of the table value with PREFIX_EUCG3 */
1610 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1623 if(x0213_f && c2 >= 0xF0){
1624 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1625 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1626 }else{ /* 78<=k<=94 */
1627 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1628 if (0x9E < c1) c2++;
1631 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1632 #define SJ6394 0x0161 /* 63 - 94 ku offset */
/* each lead byte covers two rows: double it and subtract the range offset */
1633 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
/* trail bytes above 0x9E belong to the second row of the pair */
1634 if (0x9E < c1) c2++;
1637 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1644 c2 = x0212_unshift(c2);
1651 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * Encode the value val as UTF-8 bytes written to *p1..*p4.
 * Visible branches:
 *   val < 0x800                    -> 2 bytes (0xc0 lead)
 *   nkf_char_unicode_bmp_p(val)    -> 3 bytes (0xe0 lead)
 *   nkf_char_unicode_value_p(val)  -> 4 bytes (0xf0 lead)
 * The first branch and the handling of unused outputs are not visible in
 * this excerpt.
 */
1653 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1661 }else if (val < 0x800){
1662 *p1 = 0xc0 | (val >> 6);
1663 *p2 = 0x80 | (val & 0x3f);
1666 } else if (nkf_char_unicode_bmp_p(val)) {
1667 *p1 = 0xe0 | (val >> 12);
1668 *p2 = 0x80 | ((val >> 6) & 0x3f);
1669 *p3 = 0x80 | ( val & 0x3f);
1671 } else if (nkf_char_unicode_value_p(val)) {
1672 *p1 = 0xf0 | (val >> 18);
1673 *p2 = 0x80 | ((val >> 12) & 0x3f);
1674 *p3 = 0x80 | ((val >> 6) & 0x3f);
1675 *p4 = 0x80 | ( val & 0x3f);
/*
 * Decode one UTF-8 sequence into a code point accumulated in wc.
 *   c1      - lead byte; every branch dispatches on it
 *   c2..c4  - trail bytes (only as many as the lead byte calls for are used)
 * The single-byte branch, the low-order bits of each form, and the return
 * statements are not visible in this excerpt.
 *
 * Changes from the previous revision:
 *   - the invalid-lead bound is 0xC1, not 0xC3: 0xC2 and 0xC3 are valid
 *     two-byte lead bytes (U+0080..U+00FF) and must reach the next branch;
 *   - the four-byte branch tests the lead byte c1 (it tested c2, which is
 *     a trail byte and is always <= 0xF4 for well-formed input, so lead
 *     bytes above 0xF4 were decoded instead of rejected).
 */
1685 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
1692 else if (c1 <= 0xC1) {
1693 /* trail byte or invalid */
1696 else if (c1 <= 0xDF) {
1698 wc = (c1 & 0x1F) << 6;
1701 else if (c1 <= 0xEF) {
1703 wc = (c1 & 0x0F) << 12;
1704 wc |= (c2 & 0x3F) << 6;
1707 else if (c1 <= 0xF4) {
1709 wc = (c1 & 0x0F) << 18;
1710 wc |= (c2 & 0x3F) << 12;
1711 wc |= (c3 & 0x3F) << 6;
1721 #ifdef UTF8_INPUT_ENABLE
/*
 * Look up (c1, c0) in the two-level table pp, which has psize rows, and
 * store the result through p2 / p1.
 * Returns 1 when: pp is null, c1 is outside [0, psize), the selected row
 * is null, c0 is outside [0, sizeof_utf8_to_euc_C2), or the table value
 * is 0.  Adjustments applied to c1 / c0 before the range checks, and the
 * success return, are not visible in this excerpt.
 */
1723 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1724 const unsigned short *const *pp, nkf_char psize,
1725 nkf_char *p2, nkf_char *p1)
1728 const unsigned short *p;
1731 if (pp == 0) return 1;
1734 if (c1 < 0 || psize <= c1) return 1;
1736 if (p == 0) return 1;
1739 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1741 if (val == 0) return 1;
/* with no_cp932ext_f set, the two extension ranges below are singled out */
1742 if (no_cp932ext_f && (
1743 (val>>8) == 0x2D || /* NEC special characters */
1744 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* an SO row marker is replaced by JIS_X_0201_1976_K */
1752 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * Convert a UTF-8 sequence (lead byte c2, then c1, then c0) to a JIS pair
 * stored through p2 / p1, using tables chosen by ms_ucs_map_f.
 *   - c2 < 0xe0: two-byte form, looked up via unicode_to_jis_common2()
 *     with sizeof_utf8_to_euc_2bytes rows;
 *   - c2 < 0xF0: three-byte form, looked up in ppp[c2 - 0xE0].
 * When no_best_fit_chars_f is set, selected code points return 1 before
 * any lookup.  Lines not visible in this excerpt (the first branch, the
 * final else, several guarding conditions) are left as they were.
 *
 * Change from the previous revision: the three-byte branch is guarded by
 * the lead byte c2 instead of c0.  c0 is a trail byte, so "c0 < 0xF0" did
 * not bound c2, while the branch indexes ppp[c2 - 0xE0].
 */
1760 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1762 const unsigned short *const *pp;
1763 const unsigned short *const *const *ppp;
/* The tables below are indexed by (c1 & 0x3F); a non-zero entry makes the function return 1. */
1764 static const char no_best_fit_chars_table_C2[] =
1765 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1766 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1767 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1768 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1769 static const char no_best_fit_chars_table_C2_ms[] =
1770 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1771 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1772 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1773 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1774 static const char no_best_fit_chars_table_932_C2[] =
1775 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1776 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1777 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1778 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1779 static const char no_best_fit_chars_table_932_C3[] =
1780 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1781 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1782 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1783 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* two-byte sequences */
1789 }else if(c2 < 0xe0){
1790 if(no_best_fit_chars_f){
1791 if(ms_ucs_map_f == UCS_MAP_CP932){
1794 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1797 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1800 }else if(!cp932inv_f){
1803 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1806 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1809 }else if(ms_ucs_map_f == UCS_MAP_MS){
1810 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1811 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
/* pick the two-byte table that matches the active mapping */
1829 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1830 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1831 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1833 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* three-byte sequences: guard on the lead byte, which indexes ppp below */
1834 }else if(c2 < 0xF0){
1835 if(no_best_fit_chars_f){
1836 if(ms_ucs_map_f == UCS_MAP_CP932){
1837 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1838 }else if(ms_ucs_map_f == UCS_MAP_MS){
1843 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1846 if(c0 == 0x92) return 1;
1851 if(c1 == 0x80 || c0 == 0x9C) return 1;
1854 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1859 if(c0 == 0x94) return 1;
1862 if(c0 == 0xBB) return 1;
1872 if(c0 == 0x95) return 1;
1875 if(c0 == 0xA5) return 1;
1882 if(c0 == 0x8D) return 1;
1885 if(c0 == 0x9E && !cp932inv_f) return 1;
1888 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
/* pick the three-byte table that matches the active mapping */
1896 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1897 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1898 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1900 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1902 #ifdef SHIFTJIS_CP932
/* on success with a PREFIX_EUCG3-tagged row, round-trip through e2s_conv / s2e_conv */
1903 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1905 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1906 s2e_conv(s2, s1, p2, p1);
1915 #ifdef UTF8_OUTPUT_ENABLE
/*
 * Map the pair (c2, c1) to a value through one of the *_to_utf8 tables.
 * The row pointer p is chosen from c2, then c1 selects the column; the
 * final table read and the return value are not visible in this excerpt.
 */
1917 e2w_conv(nkf_char c2, nkf_char c1)
1919 const unsigned short *p;
/* JIS X 0201 kana rows use the one-byte table (CP10001 gets special handling) */
1921 if (c2 == JIS_X_0201_1976_K) {
1922 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1930 p = euc_to_utf8_1byte;
/* PREFIX_EUCG3-tagged rows use the x0212 table */
1932 } else if (is_eucg3(c2)){
1933 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
/* reduce the row to a zero-based index and bounds-check it */
1936 c2 = (c2&0x7f) - 0x21;
1937 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1938 p = x0212_to_utf8_2bytes[c2];
1944 c2 = (c2&0x7f) - 0x21;
1945 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
/* two-byte table variant depends on the active mapping */
1947 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1948 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1949 euc_to_utf8_2bytes_ms[c2];
/* reduce the column to a zero-based index and bounds-check it */
1954 c1 = (c1 & 0x7f) - 0x21;
1955 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * Convert UTF-8 bytes (c2 lead, c1, c0) and store the result through
 * p2 / p1.  Lead bytes 0xc0..0xef go through unicode_to_jis_common();
 * under NUMCHAR_OPTION, *p1 can instead receive the decoded code point
 * wrapped by nkf_char_unicode_new() (the condition for that is not
 * visible in this excerpt).
 */
1962 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1969 }else if (0xc0 <= c2 && c2 <= 0xef) {
1970 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1971 #ifdef NUMCHAR_OPTION
/* decode as at most three bytes: the fourth byte is passed as 0 */
1974 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1982 #ifdef UTF8_INPUT_ENABLE
/*
 * Convert the code point val and store the result through p2 / p1.
 * BMP values are re-encoded as UTF-8 and passed to
 * unicode_to_jis_common(); on two visible paths *p1 is instead set to
 * nkf_char_unicode_new(val) (their conditions are not visible in this
 * excerpt).
 */
1984 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1986 nkf_char c1, c2, c3, c4;
1993 else if (nkf_char_unicode_bmp_p(val)){
/* here c1 is the lead byte, c2 / c3 the trail bytes */
1994 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1995 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1998 *p1 = nkf_char_unicode_new(val);
2004 *p1 = nkf_char_unicode_new(val);
/*
 * Input converter with the (c2, c1, c0) signature shared by s_iconv and
 * w_iconv.  NOTE(review): the name and the SS2 / 0x8f handling suggest
 * EUC-JP input.  The output call and the return value are not visible in
 * this excerpt.
 */
2011 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* JIS X 0201 kana, also introduced by SS2 */
2013 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
/* iso2022jp_f without x0201_f: substitute GETA1 / GETA2 */
2014 if (iso2022jp_f && !x0201_f) {
2015 c2 = GETA1; c1 = GETA2;
2017 c2 = JIS_X_0201_1976_K;
/* three-byte form introduced by 0x8f */
2021 }else if (c2 == 0x8f){
2025 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2026 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
/* 94 cells per row; the base 0xE3AC equals 0xE000 + 10 * 94 */
2027 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
/* fold the 0x8f marker and the row into a single value */
2030 c2 = (c2 << 8) | (c1 & 0x7f);
2032 #ifdef SHIFTJIS_CP932
2035 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2036 s2e_conv(s2, s1, &c2, &c1);
2043 #endif /* SHIFTJIS_CP932 */
2045 #endif /* X0212_ENABLE */
2046 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2049 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2050 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2051 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2056 #ifdef SHIFTJIS_CP932
/* with cp51932_f, rows 0x79..0x7c are round-tripped through e2s_conv / s2e_conv */
2057 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2059 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2060 s2e_conv(s2, s1, &c2, &c1);
2067 #endif /* SHIFTJIS_CP932 */
/*
 * Input converter with the (c2, c1, c0) signature shared by e_iconv and
 * w_iconv.  NOTE(review): the name and the byte ranges suggest Shift_JIS
 * input.  A non-zero result from s2e_conv() is returned to the caller
 * unchanged; the output call is not visible in this excerpt.
 */
2075 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* JIS X 0201 kana, either already tagged or as a raw byte 0xA1..0xDF */
2077 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2078 if (iso2022jp_f && !x0201_f) {
2079 c2 = GETA1; c1 = GETA2;
2083 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
/* lead bytes 0xF0..0xF9 map to code points from 0xE000 up when x0213_f is off */
2085 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2087 if(c1 == 0x7F) return 0;
/* 188 trail values per lead byte; (0x7E < c1) closes the gap left by 0x7F */
2088 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2091 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2092 if (ret) return ret;
/*
 * UTF-8 input converter.  Unlike s_iconv / e_iconv the parameters are in
 * stream order: c1 is the lead byte, c2 and c3 follow; c4 is a local that
 * starts at 0 (how it is filled is not visible in this excerpt).
 * Visible return values: 0 for a bad second byte of a two-byte form,
 * -1 / -2 when c3 is 0 (the same values as NKF_ICONV_NEED_ONE_MORE_BYTE /
 * NKF_ICONV_NEED_TWO_MORE_BYTES).
 */
2099 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2101 nkf_char ret = 0, c4 = 0;
/* class code per lead byte, indexed by (c1 - 0xC0); consumed by the switch below */
2102 static const char w_iconv_utf8_1st_byte[] =
2104 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2105 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2106 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2107 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2114 if (c1 < 0 || 0xff < c1) {
2115 }else if (c1 == 0) { /* 0 : 1 byte*/
2117 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
/* the case labels are not visible here; each group validates the trail bytes */
2120 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2122 if (c2 < 0x80 || 0xBF < c2) return 0;
2125 if (c3 == 0) return -1;
2126 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2131 if (c3 == 0) return -1;
2132 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2136 if (c3 == 0) return -1;
2137 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2141 if (c3 == 0) return -2;
2142 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2146 if (c3 == 0) return -2;
2147 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2151 if (c3 == 0) return -2;
2152 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2160 if (c1 == 0 || c1 == EOF){
/* four-byte sequences are decoded directly to a tagged code point */
2161 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2162 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
/* everything else is converted in place by w2e_conv */
2165 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/* returned for code points that cannot be accepted (see the uses below) */
2173 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * Feed one code point wc to the conversion pipeline.
 * Returns NKF_ICONV_INVALID_CODE_RANGE for surrogate values and for
 * values that fall through both range tests; a non-zero result from
 * w16e_conv() is returned unchanged.  The first branch and the final
 * output call are not visible in this excerpt.
 */
2175 unicode_iconv(nkf_char wc)
/* (wc >> 11) == 27 selects 0xD800..0xDFFF */
2183 }else if ((wc>>11) == 27) {
2184 /* unpaired surrogate */
2185 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * NOTE(review): both bounds below are strict, so 0xFFFF takes the second
 * branch and 0x10FFFF is rejected; confirm that this is intended.
 */
2186 }else if (wc < 0xFFFF) {
2187 ret = w16e_conv(wc, &c2, &c1);
2188 if (ret) return ret;
2189 }else if (wc < 0x10FFFF) {
2191 c1 = nkf_char_unicode_new(wc);
2193 return NKF_ICONV_INVALID_CODE_RANGE;
/* return codes asking the caller for more input bytes */
2199 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2200 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
/*
 * Combine a surrogate pair into one code point.  The constant equals
 * (0xD800 << 10) + 0xDC00 - 0x10000, so the pair (0xD800, 0xDC00) yields
 * 0x10000.
 */
2201 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * Assemble a code point from up to four UTF-16 bytes c1..c4, honouring
 * input_endian, and pass it to unicode_iconv().
 * Returns NKF_ICONV_NEED_TWO_MORE_BYTES when a high surrogate is not
 * followed by a low surrogate in the bytes given; otherwise the result of
 * unicode_iconv().  The non-surrogate paths are not visible in this
 * excerpt.
 */
2203 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/* big endian: the high-order bytes are c1 and c3 */
2212 if (input_endian == ENDIAN_BIG) {
2213 if (0xD8 <= c1 && c1 <= 0xDB) {
2214 if (0xDC <= c3 && c3 <= 0xDF) {
2215 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2216 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* otherwise the high-order bytes are c2 and c4 */
2221 if (0xD8 <= c2 && c2 <= 0xDB) {
2222 if (0xDC <= c4 && c4 <= 0xDF) {
2223 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2224 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2230 return (*unicode_iconv)(wc);
/*
 * Converter entry with the (c2, c1, c0) signature shared by the other
 * *_iconv functions; the BOM check elsewhere in this file installs it
 * with set_iconv(TRUE, w_iconv16).  Its body is not visible in this
 * excerpt.
 */
2234 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * Converter entry with the (c2, c1, c0) signature shared by the other
 * *_iconv functions; the BOM check elsewhere in this file installs it
 * with set_iconv(TRUE, w_iconv32).  Its body is not visible in this
 * excerpt.
 */
2240 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * Assemble a code point from four UTF-32 bytes c1..c4 according to
 * input_endian and pass it to unicode_iconv().
 * Returns NKF_ICONV_INVALID_CODE_RANGE on the one visible failure path,
 * otherwise the result of unicode_iconv().  Each visible assignment uses
 * the three low-order bytes of its byte order; the case labels are not
 * visible in this excerpt.
 */
2246 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2255 switch(input_endian){
/* byte order 1-2-3-4: c1 is the most significant byte and is not used */
2257 wc = c2 << 16 | c3 << 8 | c4;
/* byte order 4-3-2-1 */
2260 wc = c3 << 16 | c2 << 8 | c1;
/* byte order 2-1-4-3 */
2263 wc = c1 << 16 | c4 << 8 | c3;
/* byte order 3-4-1-2 */
2266 wc = c4 << 16 | c1 << 8 | c2;
2269 return NKF_ICONV_INVALID_CODE_RANGE;
2272 return (*unicode_iconv)(wc);
/*
 * Switch the output to `mode` when output_mode is neither ASCII nor
 * ISO_8859_1: emits bytes ending with ascii_intro through o_putc, then
 * records the new mode.  The leading bytes of the sequence are on lines
 * not visible in this excerpt.
 */
2276 #define output_ascii_escape_sequence(mode) do { \
2277 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2280 (*o_putc)(ascii_intro); \
2281 output_mode = mode; \
/*
 * Emit the escape sequence for `mode`.  The first visible test compares
 * the requested mode with the current output_mode; its consequence and
 * most of the switch are not visible in this excerpt.
 */
2286 output_escape_sequence(int mode)
2288 if (output_mode == mode)
2296 case JIS_X_0201_1976_K:
2304 (*o_putc)(kanji_intro);
/*
 * Output converter taking a (c2, c1) pair.  NOTE(review): the escape
 * sequence handling suggests ISO-2022-JP (JIS) output.
 * Each class of c2 first switches the output mode; the byte output for
 * most branches is not visible in this excerpt.
 */
2329 j_oconv(nkf_char c2, nkf_char c1)
2331 #ifdef NUMCHAR_OPTION
/* a tagged code point is first offered to the table conversion */
2332 if (c2 == 0 && nkf_char_unicode_p(c1)){
2333 w16e_conv(c1, &c2, &c1);
/* still a tagged code point: table conversion did not produce a pair */
2334 if (c2 == 0 && nkf_char_unicode_p(c1)){
2335 c2 = c1 & VALUE_MASK;
/* 0xE000..0xE757 is 1880 code points, i.e. 20 rows of 94 */
2336 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2339 c2 = 0x7F + c1 / 94;
2340 c1 = 0x21 + c1 % 94;
/* otherwise hand the value to the fallback handler, if one is installed */
2342 if (encode_fallback) (*encode_fallback)(c1);
2349 output_ascii_escape_sequence(ASCII);
2352 else if (c2 == EOF) {
2353 output_ascii_escape_sequence(ASCII);
2356 else if (c2 == ISO_8859_1) {
2357 output_ascii_escape_sequence(ISO_8859_1);
2360 else if (c2 == JIS_X_0201_1976_K) {
2361 output_escape_sequence(JIS_X_0201_1976_K);
/* PREFIX_EUCG3-tagged rows: JIS X 0213 plane 2 with x0213_f, else JIS X 0212 */
2364 } else if (is_eucg3(c2)){
2365 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2366 (*o_putc)(c2 & 0x7f);
/* out-of-range pairs are dropped; one variant accepts c2 up to 0x92 */
2371 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2372 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2373 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
/*
 * Output converter taking a (c2, c1) pair.  NOTE(review): the SS2 prefix
 * and the 0x80-set bytes suggest EUC-JP output.
 * output_mode is updated in every visible branch.  Several conditions
 * and prefix bytes are on lines not visible in this excerpt.
 */
2380 e_oconv(nkf_char c2, nkf_char c1)
/* a tagged code point is first offered to the table conversion */
2382 if (c2 == 0 && nkf_char_unicode_p(c1)){
2383 w16e_conv(c1, &c2, &c1);
2384 if (c2 == 0 && nkf_char_unicode_p(c1)){
2385 c2 = c1 & VALUE_MASK;
/* 0xE000..0xE757 is 1880 code points, i.e. 20 rows of 94 */
2386 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
/* the first 10 rows start at 0x75, the rest get the 0x8F-tagged base */
2390 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2391 c1 = 0x21 + c1 % 94;
2394 (*o_putc)((c2 & 0x7f) | 0x080);
2395 (*o_putc)(c1 | 0x080);
2397 (*o_putc)((c2 & 0x7f) | 0x080);
2398 (*o_putc)(c1 | 0x080);
/* otherwise hand the value to the fallback handler, if one is installed */
2402 if (encode_fallback) (*encode_fallback)(c1);
2410 } else if (c2 == 0) {
2411 output_mode = ASCII;
/* JIS X 0201 kana: SS2 followed by the byte with the high bit set */
2413 } else if (c2 == JIS_X_0201_1976_K) {
2414 output_mode = EUC_JP;
2415 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2416 } else if (c2 == ISO_8859_1) {
2417 output_mode = ISO_8859_1;
2418 (*o_putc)(c1 | 0x080);
2420 } else if (is_eucg3(c2)){
2421 output_mode = EUC_JP;
2422 #ifdef SHIFTJIS_CP932
/* round-trip through e2s_conv / s2e_conv, then re-classify the result */
2425 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2426 s2e_conv(s2, s1, &c2, &c1);
2431 output_mode = ASCII;
2433 }else if (is_eucg3(c2)){
2436 (*o_putc)((c2 & 0x7f) | 0x080);
2437 (*o_putc)(c1 | 0x080);
2440 (*o_putc)((c2 & 0x7f) | 0x080);
2441 (*o_putc)(c1 | 0x080);
/* a non-graphic pair resets the converter via set_iconv(FALSE, 0) and is dropped */
2445 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2446 set_iconv(FALSE, 0);
2447 return; /* too late to rescue this char */
2449 output_mode = EUC_JP;
2450 (*o_putc)(c2 | 0x080);
2451 (*o_putc)(c1 | 0x080);
/*
 * Output converter taking a (c2, c1) pair.  NOTE(review): the use of
 * e2s_conv() and output_mode = SHIFT_JIS suggests Shift_JIS output.
 * output_mode is updated in every visible branch.  The final byte output
 * is on lines not visible in this excerpt.
 */
2456 s_oconv(nkf_char c2, nkf_char c1)
2458 #ifdef NUMCHAR_OPTION
/* a tagged code point is first offered to the table conversion */
2459 if (c2 == 0 && nkf_char_unicode_p(c1)){
2460 w16e_conv(c1, &c2, &c1);
2461 if (c2 == 0 && nkf_char_unicode_p(c1)){
2462 c2 = c1 & VALUE_MASK;
/* 0xE000..0xE757 is 1880 code points, i.e. 10 lead bytes of 188 cells */
2463 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
/* the lead byte base depends on cp932inv_f */
2466 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
/* trail byte starts at 0x40; offsets above 0x3e are bumped past 0x7f */
2468 c1 += 0x40 + (c1 > 0x3e);
/* otherwise hand the value to the fallback handler, if one is installed */
2473 if(encode_fallback)(*encode_fallback)(c1);
2482 } else if (c2 == 0) {
2483 output_mode = ASCII;
2485 } else if (c2 == JIS_X_0201_1976_K) {
2486 output_mode = SHIFT_JIS;
2488 } else if (c2 == ISO_8859_1) {
2489 output_mode = ISO_8859_1;
2490 (*o_putc)(c1 | 0x080);
/* PREFIX_EUCG3-tagged rows are converted in place; 0 means success */
2492 } else if (is_eucg3(c2)){
2493 output_mode = SHIFT_JIS;
2494 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* a non-printable pair resets the converter via set_iconv(FALSE, 0) and is dropped */
2500 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2501 set_iconv(FALSE, 0);
2502 return; /* too late to rescue this char */
2504 output_mode = SHIFT_JIS;
2505 e2s_conv(c2, c1, &c2, &c1);
2507 #ifdef SHIFTJIS_CP932
/* optional remap through the cp932inv table, indexed like in s2e_conv */
2509 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2510 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2516 #endif /* SHIFTJIS_CP932 */
/* a non-zero prefix_table entry for the trail byte is emitted before it */
2519 if (prefix_table[(unsigned char)c1]){
2520 (*o_putc)(prefix_table[(unsigned char)c1]);
2526 #ifdef UTF8_OUTPUT_ENABLE
/*
 * Output converter taking a (c2, c1) pair and emitting UTF-8 bytes
 * through o_putc.  output_bom_f is cleared on the way (the BOM output
 * itself is not visible in this excerpt).
 */
2528 w_oconv(nkf_char c2, nkf_char c1)
2534 output_bom_f = FALSE;
/* tagged code point: strip the tag and encode it directly */
2545 if (c2 == 0 && nkf_char_unicode_p(c1)){
2546 val = c1 & VALUE_MASK;
2547 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
/* trailing bytes are written only when non-zero */
2549 if (c2) (*o_putc)(c2);
2550 if (c3) (*o_putc)(c3);
2551 if (c4) (*o_putc)(c4);
/* otherwise map the pair through e2w_conv first */
2558 val = e2w_conv(c2, c1);
2560 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2562 if (c2) (*o_putc)(c2);
2563 if (c3) (*o_putc)(c3);
2564 if (c4) (*o_putc)(c4);
/*
 * Output converter taking a (c2, c1) pair and emitting 16-bit units
 * through o_putc in the byte order selected by output_endian.
 * output_bom_f is cleared on the way (the BOM bytes themselves are not
 * visible in this excerpt).
 */
2570 w_oconv16(nkf_char c2, nkf_char c1)
2573 output_bom_f = FALSE;
2574 if (output_endian == ENDIAN_LITTLE){
2588 if (c2 == 0 && nkf_char_unicode_p(c1)) {
/* BMP value: split into high and low byte */
2589 if (nkf_char_unicode_bmp_p(c1)) {
2590 c2 = (c1 >> 8) & 0xff;
/* above the BMP: emit a surrogate pair, two bytes each */
2594 if (c1 <= UNICODE_MAX) {
2595 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2596 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2597 if (output_endian == ENDIAN_LITTLE){
2598 (*o_putc)(c2 & 0xff);
2599 (*o_putc)((c2 >> 8) & 0xff);
2600 (*o_putc)(c1 & 0xff);
2601 (*o_putc)((c1 >> 8) & 0xff);
2603 (*o_putc)((c2 >> 8) & 0xff);
2604 (*o_putc)(c2 & 0xff);
2605 (*o_putc)((c1 >> 8) & 0xff);
2606 (*o_putc)(c1 & 0xff);
/* untagged pair: map through e2w_conv, then split the value */
2612 nkf_char val = e2w_conv(c2, c1);
2613 c2 = (val >> 8) & 0xff;
2618 if (output_endian == ENDIAN_LITTLE){
/*
 * Output converter taking a (c2, c1) pair and emitting a 32-bit unit
 * through o_putc in the byte order selected by output_endian.
 * output_bom_f is cleared on the way.  The fourth byte of each unit and
 * the BOM bytes are on lines not visible in this excerpt.
 */
2628 w_oconv32(nkf_char c2, nkf_char c1)
2631 output_bom_f = FALSE;
2632 if (output_endian == ENDIAN_LITTLE){
2650 if (c2 == ISO_8859_1) {
2652 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
/* untagged pair: map through e2w_conv */
2655 c1 = e2w_conv(c2, c1);
/* little endian: least significant byte first */
2658 if (output_endian == ENDIAN_LITTLE){
2659 (*o_putc)( c1 & 0xFF);
2660 (*o_putc)((c1 >> 8) & 0xFF);
2661 (*o_putc)((c1 >> 16) & 0xFF);
/* otherwise most significant bytes first */
2665 (*o_putc)((c1 >> 16) & 0xFF);
2666 (*o_putc)((c1 >> 8) & 0xFF);
2667 (*o_putc)( c1 & 0xFF);
/*
 * Score bits used by the input-code guessing logic.  Each flag is the
 * previous one shifted left by one, so they form a bit set that is
 * combined with |= in set_code_score() and cleared with &= ~ in
 * clr_code_score().
 */
2672 #define SCORE_L2 (1) /* Kanji Level 2 */
2673 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2674 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2675 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2676 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2677 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */
2678 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2679 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
/* initial score of a candidate (assigned in status_reset) */
2681 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score flags indexed by (c2 & 0x0f); code_score() consults this table
 * when (c2 & 0x70) == 0x20.  Only part of the initializer is visible in
 * this excerpt.
 */
2683 static const nkf_char score_table_A0[] = {
2686 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2687 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score flags indexed by (c2 & 0x0f); code_score() consults this table
 * when (c2 & 0x70) == 0x70.
 */
2690 static const nkf_char score_table_F0[] = {
2691 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2692 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2693 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2694 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* Set the given score flag bit(s) on the candidate ptr. */
2698 set_code_score(struct input_code *ptr, nkf_char score)
2701 ptr->score |= score;
/* Clear the given score flag bit(s) on the candidate ptr. */
2706 clr_code_score(struct input_code *ptr, nkf_char score)
2709 ptr->score &= ~score;
/*
 * Update the score of candidate ptr from the character held in
 * ptr->buf[0] (and ptr->buf[1] when UTF8_OUTPUT_ENABLE is defined).
 * The first condition of the chain is not visible in this excerpt.
 */
2714 code_score(struct input_code *ptr)
2716 nkf_char c2 = ptr->buf[0];
2717 #ifdef UTF8_OUTPUT_ENABLE
2718 nkf_char c1 = ptr->buf[1];
2721 set_code_score(ptr, SCORE_ERROR);
2722 }else if (c2 == SS2){
2723 set_code_score(ptr, SCORE_KANA);
2724 }else if (c2 == 0x8f){
2725 set_code_score(ptr, SCORE_X0212);
2726 #ifdef UTF8_OUTPUT_ENABLE
/* a pair for which e2w_conv yields 0 is scored as undefined */
2727 }else if (!e2w_conv(c2, c1)){
2728 set_code_score(ptr, SCORE_NO_EXIST);
/* the remaining cases classify by bits 4..6 of c2 */
2730 }else if ((c2 & 0x70) == 0x20){
2731 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2732 }else if ((c2 & 0x70) == 0x70){
2733 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2734 }else if ((c2 & 0x70) >= 0x50){
2735 set_code_score(ptr, SCORE_L2);
/*
 * Disable the candidate ptr.  If its converter is the active one, the
 * active converter is reset with set_iconv(FALSE, 0).  The statements
 * that mark the candidate itself are not visible in this excerpt.
 */
2740 status_disable(struct input_code *ptr)
2745 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * Append c to the candidate's buffer and advance its index.
 * NOTE(review): no bounds check is visible here; callers presumably
 * keep index within buf.
 */
2749 status_push_ch(struct input_code *ptr, nkf_char c)
2751 ptr->buf[ptr->index++] = c;
/* Takes the candidate ptr; the body is not visible in this excerpt. */
2755 status_clear(struct input_code *ptr)
/*
 * Reset the candidate ptr; the visible part sets its score back to
 * SCORE_INIT.
 */
2762 status_reset(struct input_code *ptr)
2765 ptr->score = SCORE_INIT;
/*
 * Re-initialise the candidate ptr; the visible part zeroes its
 * _file_stat member.
 */
2769 status_reinit(struct input_code *ptr)
2772 ptr->_file_stat = 0;
/*
 * Called by the *_status functions for each input byte c.  It acts only
 * when c is at most DEL and the code is already established (estab_f);
 * the action itself is not visible in this excerpt.
 */
2776 status_check(struct input_code *ptr, nkf_char c)
2778 if (c <= DEL && estab_f){
/*
 * Per-byte state machine for one input-code candidate.  NOTE(review):
 * the name and the byte ranges suggest the Shift_JIS candidate.
 * The switch on the candidate's state and the state assignments are not
 * visible in this excerpt; the first group below classifies a lead byte,
 * the later groups validate a trail byte.
 */
2784 s_status(struct input_code *ptr, nkf_char c)
2788 status_check(ptr, c);
2793 }else if (nkf_char_unicode_p(c)){
/* single byte 0xa1..0xdf: recorded as an SS2-prefixed pair */
2795 }else if (0xa1 <= c && c <= 0xdf){
2796 status_push_ch(ptr, SS2);
2797 status_push_ch(ptr, c);
/* lead byte ranges; each pushes the byte */
2800 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2802 status_push_ch(ptr, c);
2803 }else if (0xed <= c && c <= 0xee){
2805 status_push_ch(ptr, c);
2806 #ifdef SHIFTJIS_CP932
2807 }else if (is_ibmext_in_sjis(c)){
2809 status_push_ch(ptr, c);
2810 #endif /* SHIFTJIS_CP932 */
2812 }else if (0xf0 <= c && c <= 0xfc){
2814 status_push_ch(ptr, c);
2815 #endif /* X0212_ENABLE */
/* anything else rules this candidate out */
2817 status_disable(ptr);
/* trail byte must be 0x40..0x7e or 0x80..0xfc; the pair is converted in place */
2821 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2822 status_push_ch(ptr, c);
2823 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2827 status_disable(ptr);
2831 #ifdef SHIFTJIS_CP932
2832 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2833 status_push_ch(ptr, c);
/* a successful conversion here is scored as SCORE_CP932 */
2834 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2835 set_code_score(ptr, SCORE_CP932);
2840 #endif /* SHIFTJIS_CP932 */
2841 status_disable(ptr);
2844 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2845 status_push_ch(ptr, c);
2846 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2847 set_code_score(ptr, SCORE_CP932);
2850 status_disable(ptr);
/*
 * Per-byte state machine for one input-code candidate.  NOTE(review):
 * the name and the SS2 / 0x8f / 0xa1..0xfe ranges suggest the EUC-JP
 * candidate.  The switch on the candidate's state and the state
 * assignments are not visible in this excerpt.
 */
2857 e_status(struct input_code *ptr, nkf_char c)
2861 status_check(ptr, c);
2866 }else if (nkf_char_unicode_p(c)){
/* first byte of a two-byte form */
2868 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2870 status_push_ch(ptr, c);
/* first byte of a three-byte form */
2872 }else if (0x8f == c){
2874 status_push_ch(ptr, c);
2875 #endif /* X0212_ENABLE */
/* anything else rules this candidate out */
2877 status_disable(ptr);
/* following bytes must be 0xa1..0xfe */
2881 if (0xa1 <= c && c <= 0xfe){
2882 status_push_ch(ptr, c);
2886 status_disable(ptr);
2891 if (0xa1 <= c && c <= 0xfe){
2893 status_push_ch(ptr, c);
2895 status_disable(ptr);
2897 #endif /* X0212_ENABLE */
2901 #ifdef UTF8_INPUT_ENABLE
/*
 * Per-byte state machine for one input-code candidate.  NOTE(review):
 * the name and the lead/trail byte ranges suggest the UTF-8 candidate.
 * The switch on the candidate's state and the state assignments are not
 * visible in this excerpt.
 */
2903 w_status(struct input_code *ptr, nkf_char c)
2907 status_check(ptr, c);
2912 }else if (nkf_char_unicode_p(c)){
/* lead byte ranges 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 each push the byte */
2914 }else if (0xc0 <= c && c <= 0xdf){
2916 status_push_ch(ptr, c);
2917 }else if (0xe0 <= c && c <= 0xef){
2919 status_push_ch(ptr, c);
2920 }else if (0xf0 <= c && c <= 0xf4){
2922 status_push_ch(ptr, c);
/* anything else rules this candidate out */
2924 status_disable(ptr);
/* trail bytes must be 0x80..0xbf */
2929 if (0x80 <= c && c <= 0xbf){
2930 status_push_ch(ptr, c);
/* sequence complete: note whether it is EF BB BF, then convert it in place */
2931 if (ptr->index > ptr->stat){
2932 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2933 && ptr->buf[2] == 0xbf);
2934 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2935 &ptr->buf[0], &ptr->buf[1]);
2942 status_disable(ptr);
2946 if (0x80 <= c && c <= 0xbf){
/* still collecting trail bytes */
2947 if (ptr->index < ptr->stat){
2948 status_push_ch(ptr, c);
2953 status_disable(ptr);
/*
 * Feed the byte c to the status function of the entries in
 * input_code_list.  When a result entry is chosen and the code is not yet
 * established, its converter is installed with set_iconv(TRUE, ...).
 * How result and action_flag are updated is on lines not visible in this
 * excerpt.
 */
2961 code_status(nkf_char c)
2963 int action_flag = 1;
2964 struct input_code *result = 0;
2965 struct input_code *p = input_code_list;
/* entries without a status function get separate handling */
2967 if (!p->status_func) {
2971 if (!p->status_func)
2973 (p->status_func)(p, c);
2976 }else if(p->stat == 0){
2987 if (result && !estab_f){
2988 set_iconv(TRUE, result->iconv_func);
/* for bytes up to DEL the list is walked again from the start */
2989 }else if (c <= DEL){
2990 struct input_code *ptr = input_code_list;
/*
 * Visible members of the state record behind nkf_state; the enclosing
 * declaration and any further members are not visible in this excerpt.
 */
/* pushback buffer used by std_ungetc() */
3000 nkf_buf_t *std_gc_buf;
/* last character remembered by broken_getc() */
3001 nkf_char broken_state;
/* pushback buffer used by broken_getc() / broken_ungetc() */
3002 nkf_buf_t *broken_buf;
3003 nkf_char mimeout_state;
/* the shared state record; NULL until nkf_state_init() allocates it */
3007 static nkf_state_t *nkf_state = NULL;
/* capacity of the std_gc_buf pushback buffer */
3009 #define STD_GC_BUFSIZE (256)
/*
 * Prepare nkf_state.  Two paths are visible: one clears the three
 * existing buffers, the other allocates the record and its buffers.
 * Presumably the first runs when nkf_state already exists; the branch
 * condition is not visible in this excerpt.  Both state fields are then
 * zeroed.
 */
3012 nkf_state_init(void)
3015 nkf_buf_clear(nkf_state->std_gc_buf);
3016 nkf_buf_clear(nkf_state->broken_buf);
3017 nkf_buf_clear(nkf_state->nfc_buf);
3020 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3021 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3022 nkf_state->broken_buf = nkf_buf_new(3);
3023 nkf_state->nfc_buf = nkf_buf_new(9);
3025 nkf_state->broken_state = 0;
3026 nkf_state->mimeout_state = 0;
/*
 * Characters pushed back into std_gc_buf are returned first.  The
 * function header and the fallback read are not visible in this
 * excerpt.
 */
3033 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3034 return nkf_buf_pop(nkf_state->std_gc_buf);
/*
 * Push c back by storing it in nkf_state->std_gc_buf.  The FILE argument
 * is not used on the visible line.
 */
3041 std_ungetc(nkf_char c, FILE *f)
3043 nkf_buf_push(nkf_state->std_gc_buf, c);
/* Takes one character c; the body is not visible in this excerpt. */
3049 std_putc(nkf_char c)
/* bytes held back while the input code is undecided (see h_conv) */
3056 static unsigned char hold_buf[HOLD_SIZE*2];
3057 static int hold_count = 0;
/*
 * Append c2 to hold_buf.
 * Returns EOF when the buffer has just become full, otherwise the new
 * number of held bytes.  What is returned when the buffer was already
 * full on entry is not visible in this excerpt.
 */
3059 push_hold_buf(nkf_char c2)
3061 if (hold_count >= HOLD_SIZE*2)
3063 hold_buf[hold_count++] = (unsigned char)c2;
3064 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * Hold input bytes in hold_buf until the input code is decided, then
 * replay them through the chosen converter.
 * Reading stops at EOF, when the hold buffer fills up, or when estab_f
 * becomes set.  If needed, the candidate with the lowest score is
 * installed with set_iconv(TRUE, ...).  The return value and several
 * conditions are on lines not visible in this excerpt.
 */
3068 h_conv(FILE *f, nkf_char c1, nkf_char c2)
3074 /** it must NOT be in the kanji shift sequence */
3075 /** it must NOT be written in JIS7 */
3076 /** and it must be after 2 byte 8bit code */
3082 while ((c2 = (*i_getc)(f)) != EOF) {
3088 if (push_hold_buf(c2) == EOF || estab_f) {
/* pick the candidate with the smallest score among those with a status function */
3094 struct input_code *p = input_code_list;
3095 struct input_code *result = p;
3100 if (p->status_func && p->score < result->score) {
3105 set_iconv(TRUE, result->iconv_func);
3110 ** 1) EOF is detected, or
3111 ** 2) Code is established, or
3112 ** 3) Buffer is FULL (but last word is pushed)
3114 ** in 1) and 3) cases, we continue to use
3115 ** Kanji codes by oconv and leave estab_f unchanged.
/* replay the held bytes through the selected converter */
3120 while (hold_index < hold_count){
3121 c1 = hold_buf[hold_index++];
/* with s_iconv, a byte 0xa1..0xdf is passed on as JIS X 0201 kana */
3125 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3126 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3129 if (hold_index < hold_count){
3130 c2 = hold_buf[hold_index++];
/* the converter's result selects how many more bytes must be fetched */
3140 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
/* further bytes come from the hold buffer first, then from the stream */
3143 if (hold_index < hold_count){
3144 c3 = hold_buf[hold_index++];
3145 } else if ((c3 = (*i_getc)(f)) == EOF) {
3150 if (hold_index < hold_count){
3151 c4 = hold_buf[hold_index++];
3152 } else if ((c4 = (*i_getc)(f)) == EOF) {
/* the third and fourth byte are packed into one argument */
3157 (*iconv)(c1, c2, (c3<<8)|c4);
3160 /* 3 bytes EUC or UTF-8 */
3161 if (hold_index < hold_count){
3162 c3 = hold_buf[hold_index++];
3163 } else if ((c3 = (*i_getc)(f)) == EOF) {
3169 (*iconv)(c1, c2, c3);
3172 if (c3 == EOF) break;
/*
 * BOM detection (the function header and the case labels are not visible
 * in this excerpt).  The first bytes of the stream are read with i_getc;
 * bytes that do not complete a known mark are pushed back with i_ungetc
 * in reverse order.  For a recognised mark the matching converter is
 * installed when no input_encoding was given, and input_endian is set
 * when that converter is the active one:
 *   00 00 FE FF -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, ENDIAN_2143
 *   EF BB BF    -> w_iconv
 *   FE FF 00 00 -> w_iconv32, ENDIAN_3412
 *   FE FF       -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, ENDIAN_LITTLE
 */
3178 * Check and Ignore BOM
3184 switch(c2 = (*i_getc)(f)){
3186 if((c2 = (*i_getc)(f)) == 0x00){
3187 if((c2 = (*i_getc)(f)) == 0xFE){
3188 if((c2 = (*i_getc)(f)) == 0xFF){
3189 if(!input_encoding){
3190 set_iconv(TRUE, w_iconv32);
3192 if (iconv == w_iconv32) {
3193 input_endian = ENDIAN_BIG;
3196 (*i_ungetc)(0xFF,f);
3197 }else (*i_ungetc)(c2,f);
3198 (*i_ungetc)(0xFE,f);
3199 }else if(c2 == 0xFF){
3200 if((c2 = (*i_getc)(f)) == 0xFE){
3201 if(!input_encoding){
3202 set_iconv(TRUE, w_iconv32);
3204 if (iconv == w_iconv32) {
3205 input_endian = ENDIAN_2143;
3208 (*i_ungetc)(0xFF,f);
3209 }else (*i_ungetc)(c2,f);
3210 (*i_ungetc)(0xFF,f);
3211 }else (*i_ungetc)(c2,f);
3212 (*i_ungetc)(0x00,f);
3213 }else (*i_ungetc)(c2,f);
3214 (*i_ungetc)(0x00,f);
3217 if((c2 = (*i_getc)(f)) == 0xBB){
3218 if((c2 = (*i_getc)(f)) == 0xBF){
3219 if(!input_encoding){
3220 set_iconv(TRUE, w_iconv);
3222 if (iconv == w_iconv) {
3225 (*i_ungetc)(0xBF,f);
3226 }else (*i_ungetc)(c2,f);
3227 (*i_ungetc)(0xBB,f);
3228 }else (*i_ungetc)(c2,f);
3229 (*i_ungetc)(0xEF,f);
3232 if((c2 = (*i_getc)(f)) == 0xFF){
3233 if((c2 = (*i_getc)(f)) == 0x00){
3234 if((c2 = (*i_getc)(f)) == 0x00){
3235 if(!input_encoding){
3236 set_iconv(TRUE, w_iconv32);
3238 if (iconv == w_iconv32) {
3239 input_endian = ENDIAN_3412;
3242 (*i_ungetc)(0x00,f);
3243 }else (*i_ungetc)(c2,f);
3244 (*i_ungetc)(0x00,f);
3245 }else (*i_ungetc)(c2,f);
3246 if(!input_encoding){
3247 set_iconv(TRUE, w_iconv16);
3249 if (iconv == w_iconv16) {
3250 input_endian = ENDIAN_BIG;
3253 (*i_ungetc)(0xFF,f);
3254 }else (*i_ungetc)(c2,f);
3255 (*i_ungetc)(0xFE,f);
3258 if((c2 = (*i_getc)(f)) == 0xFE){
3259 if((c2 = (*i_getc)(f)) == 0x00){
3260 if((c2 = (*i_getc)(f)) == 0x00){
3261 if(!input_encoding){
3262 set_iconv(TRUE, w_iconv32);
3264 if (iconv == w_iconv32) {
3265 input_endian = ENDIAN_LITTLE;
3268 (*i_ungetc)(0x00,f);
3269 }else (*i_ungetc)(c2,f);
3270 (*i_ungetc)(0x00,f);
3271 }else (*i_ungetc)(c2,f);
3272 if(!input_encoding){
3273 set_iconv(TRUE, w_iconv16);
3275 if (iconv == w_iconv16) {
3276 input_endian = ENDIAN_LITTLE;
3279 (*i_ungetc)(0xFE,f);
3280 }else (*i_ungetc)(c2,f);
3281 (*i_ungetc)(0xFF,f);
/*
 * Input filter that recognises '$' or '(' followed by a designator
 * character even when the preceding character was not ESC.
 * NOTE(review): this looks like repair of JIS escape sequences whose ESC
 * byte was lost; the value returned in that case is on lines not visible
 * in this excerpt.
 * Characters waiting in broken_buf are returned first.
 */
3290 broken_getc(FILE *f)
3294 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3295 return nkf_buf_pop(nkf_state->broken_buf);
/* '$' without a preceding ESC while in ASCII / JIS X 0201 kana mode */
3298 if (c=='$' && nkf_state->broken_state != ESC
3299 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3301 nkf_state->broken_state = 0;
/* designators '@' / 'B': queue c1 then c so that they are popped as c, c1 */
3302 if (c1=='@'|| c1=='B') {
3303 nkf_buf_push(nkf_state->broken_buf, c1);
3304 nkf_buf_push(nkf_state->broken_buf, c);
/* '(' without a preceding ESC while in JIS X 0208 / JIS X 0201 kana mode */
3310 } else if (c=='(' && nkf_state->broken_state != ESC
3311 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3313 nkf_state->broken_state = 0;
3314 if (c1=='J'|| c1=='B') {
3315 nkf_buf_push(nkf_state->broken_buf, c1);
3316 nkf_buf_push(nkf_state->broken_buf, c);
/* remember the character so the next call can tell whether an ESC preceded it */
3323 nkf_state->broken_state = c;
/*
 * Push c back into broken_buf, but only while fewer than two characters
 * are queued there; otherwise the visible code does nothing with c.
 */
3329 broken_ungetc(nkf_char c, FILE *f)
3331 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3332 nkf_buf_push(nkf_state->broken_buf, c);
/*
 * End-of-line filter in the output chain; forwards to o_eol_conv.
 * With guess_f set it also tracks the input line ending in input_eol
 * (LF, CRLF or CR; EOF once two different endings were seen).
 * A CR is remembered in prev_cr rather than forwarded, so that CR LF can
 * be treated as one line end; line ends are re-emitted according to
 * eolmode_f.
 */
3337 eol_conv(nkf_char c2, nkf_char c1)
/* guessing is skipped once input_eol has been set to EOF */
3339 if (guess_f && input_eol != EOF) {
3340 if (c2 == 0 && c1 == LF) {
3341 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3342 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3343 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3345 else if (!input_eol) input_eol = CR;
3346 else if (input_eol != CR) input_eol = EOF;
/* a pending CR or a bare LF ends the line */
3348 if (prev_cr || (c2 == 0 && c1 == LF)) {
/* emit CR unless the target is LF-only, and LF unless it is CR-only */
3350 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3351 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
/* hold back CR; pass on everything except a bare LF, which was handled above */
3353 if (c2 == 0 && c1 == CR) prev_cr = CR;
3354 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3358 Return value of fold_conv()
3360 LF add newline and output char
3361 CR add newline and output nothing
3364 1 (or else) normal output
3366 fold state in prev (previous character)
3368 >0x80 Japanese (X0208/X0201)
3373 This fold algorithm does not preserve heading space in a line.
3374 This is the main difference from fmt.
/*
 * Display width of a character: 2 when c2 is non-zero, else 1.
 * NOTE(review): c2 is not parenthesised in the expansion and c1 is
 * unused.
 */
3377 #define char_size(c2,c1) (c2?2:1)
/*
 * Line-folding filter in the output chain.
 * f_line tracks the current column and f_prev the previous character
 * class; fold_state is the decision for this character and is acted on
 * by the switch at the end (output goes through o_fconv).  Characters
 * matched by the kinsoku tests below get fold_state = 1 (no fold) when
 * they would otherwise start a new line, as long as the line does not
 * exceed fold_len + fold_margin.
 */
3380 fold_conv(nkf_char c2, nkf_char c1)
3383 nkf_char fold_state;
3385 if (c1== CR && !fold_preserve_f) {
3386 fold_state=0; /* ignore cr */
3387 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3389 fold_state=0; /* ignore cr */
3390 } else if (c1== BS) {
3391 if (f_line>0) f_line--;
3393 } else if (c2==EOF && f_line != 0) { /* close open last line */
3395 } else if ((c1==LF && !fold_preserve_f)
3396 || ((c1==CR||(c1==LF&&f_prev!=CR))
3397 && fold_preserve_f)) {
3399 if (fold_preserve_f) {
3403 } else if ((f_prev == c1 && !fold_preserve_f)
3404 || (f_prev == LF && fold_preserve_f)
3405 ) { /* duplicate newline */
3408 fold_state = LF; /* output two newline */
3414 if (f_prev&0x80) { /* Japanese? */
3416 fold_state = 0; /* ignore given single newline */
3417 } else if (f_prev==SP) {
3421 if (++f_line<=fold_len)
3425 fold_state = CR; /* fold and output nothing */
3429 } else if (c1=='\f') {
3432 fold_state = LF; /* output newline and clear */
3433 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3434 /* X0208 kankaku or ascii space */
3436 fold_state = 0; /* remove duplicate spaces */
3439 if (++f_line<=fold_len)
3440 fold_state = SP; /* output ASCII space only */
3442 f_prev = SP; f_line = 0;
3443 fold_state = CR; /* fold and output nothing */
3447 prev0 = f_prev; /* we still need this one... , but almost done */
/* NOTE(review): the second operand is redundant, it already implies c2 != 0 */
3449 if (c2 || c2 == JIS_X_0201_1976_K)
3450 f_prev |= 0x80; /* this is Japanese */
3451 f_line += char_size(c2,c1);
3452 if (f_line<=fold_len) { /* normal case */
3455 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3456 f_line = char_size(c2,c1);
3457 fold_state = LF; /* We can't wait, do fold now */
3458 } else if (c2 == JIS_X_0201_1976_K) {
3459 /* simple kinsoku rules return 1 means no folding */
3460 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3461 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3462 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3463 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3464 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3465 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3466 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3468 fold_state = LF;/* add one new f_line before this character */
3471 fold_state = LF;/* add one new f_line before this character */
3474 /* kinsoku point in ASCII */
3475 if ( c1==')'|| /* { [ ( */
3486 /* just after special */
3487 } else if (!is_alnum(prev0)) {
3488 f_line = char_size(c2,c1);
3490 } else if ((prev0==SP) || /* ignored new f_line */
3491 (prev0==LF)|| /* ignored new f_line */
3492 (prev0&0x80)) { /* X0208 - ASCII */
3493 f_line = char_size(c2,c1);
3494 fold_state = LF;/* add one new f_line before this character */
3496 fold_state = 1; /* default no fold in ASCII */
3500 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3501 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3502 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3503 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3504 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3505 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3506 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3507 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3508 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3509 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3510 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3511 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3512 /* default no fold in kinsoku */
3515 f_line = char_size(c2,c1);
3516 /* add one new f_line before this character */
3519 f_line = char_size(c2,c1);
3521 /* add one new f_line before this character */
3526 /* terminator process */
3527 switch(fold_state) {
3529 OCONV_NEWLINE((*o_fconv));
3535 OCONV_NEWLINE((*o_fconv));
3546 static nkf_char z_prev2=0,z_prev1=0;
3549 z_conv(nkf_char c2, nkf_char c1)
3552 /* if (c2) c1 &= 0x7f; assertion */
3554 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3560 if (z_prev2 == JIS_X_0201_1976_K) {
3561 if (c2 == JIS_X_0201_1976_K) {
3562 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
3564 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3566 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
3568 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3573 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3575 if (c2 == JIS_X_0201_1976_K) {
3576 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3577 /* wait for dakuten (voiced sound mark) or handakuten (semi-voiced sound mark) */
3582 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3593 if (alpha_f&1 && c2 == 0x23) {
3594 /* JISX0208 Alphabet */
3596 } else if (c2 == 0x21) {
3597 /* JISX0208 Kigou */
3602 } else if (alpha_f&4) {
3607 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3613 if (alpha_f&8 && c2 == 0) {
3615 const char *entity = 0;
3617 case '>': entity = ">"; break;
3618 case '<': entity = "<"; break;
3619 case '\"': entity = """; break;
3620 case '&': entity = "&"; break;
3623 while (*entity) (*o_zconv)(0, *entity++);
3629 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3634 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3638 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3642 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3646 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3650 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3654 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3658 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3662 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3667 (*o_zconv)(JIS_X_0201_1976_K, c);
3670 } else if (c2 == 0x25) {
3671 /* JISX0208 Katakana */
3672 static const int fullwidth_to_halfwidth[] =
3674 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3675 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3676 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3677 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3678 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3679 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3680 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3681 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3682 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3683 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3684 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3685 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3687 if (fullwidth_to_halfwidth[c1-0x20]){
3688 c2 = fullwidth_to_halfwidth[c1-0x20];
3689 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3691 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
3701 #define rot13(c) ( \
3703 (c <= 'M') ? (c + 13): \
3704 (c <= 'Z') ? (c - 13): \
3706 (c <= 'm') ? (c + 13): \
3707 (c <= 'z') ? (c - 13): \
3711 #define rot47(c) ( \
3713 ( c <= 'O') ? (c + 47) : \
3714 ( c <= '~') ? (c - 47) : \
3719 rot_conv(nkf_char c2, nkf_char c1)
3721 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3727 (*o_rot_conv)(c2,c1);
3731 hira_conv(nkf_char c2, nkf_char c1)
3735 if (0x20 < c1 && c1 < 0x74) {
3737 (*o_hira_conv)(c2,c1);
3739 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3741 c1 = nkf_char_unicode_new(0x3094);
3742 (*o_hira_conv)(c2,c1);
3745 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3747 (*o_hira_conv)(c2,c1);
3752 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3755 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3757 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3761 (*o_hira_conv)(c2,c1);
3766 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3768 #define RANGE_NUM_MAX 18
3769 static const nkf_char range[RANGE_NUM_MAX][2] = {
3790 nkf_char start, end, c;
3792 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3796 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3801 for (i = 0; i < RANGE_NUM_MAX; i++) {
3802 start = range[i][0];
3805 if (c >= start && c <= end) {
3810 (*o_iso2022jp_check_conv)(c2,c1);
3814 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3816 static const unsigned char *mime_pattern[] = {
3817 (const unsigned char *)"\075?EUC-JP?B?",
3818 (const unsigned char *)"\075?SHIFT_JIS?B?",
3819 (const unsigned char *)"\075?ISO-8859-1?Q?",
3820 (const unsigned char *)"\075?ISO-8859-1?B?",
3821 (const unsigned char *)"\075?ISO-2022-JP?B?",
3822 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3823 #if defined(UTF8_INPUT_ENABLE)
3824 (const unsigned char *)"\075?UTF-8?B?",
3825 (const unsigned char *)"\075?UTF-8?Q?",
3827 (const unsigned char *)"\075?US-ASCII?Q?",
3832 /* Marker for raising the priority of the corresponding code (encoding) */
3833 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3834 e_iconv, s_iconv, 0, 0, 0, 0,
3835 #if defined(UTF8_INPUT_ENABLE)
3841 static const nkf_char mime_encode[] = {
3842 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3843 #if defined(UTF8_INPUT_ENABLE)
3850 static const nkf_char mime_encode_method[] = {
3851 'B', 'B','Q', 'B', 'B', 'Q',
3852 #if defined(UTF8_INPUT_ENABLE)
3860 /* MIME preprocessor fifo */
3862 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3863 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3864 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3866 unsigned char buf[MIME_BUF_SIZE];
3868 unsigned int last; /* decoded */
3869 unsigned int input; /* undecoded */
3871 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
3873 #define MAXRECOVER 20
3876 mime_input_buf_unshift(nkf_char c)
3878 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
3882 mime_ungetc(nkf_char c, FILE *f)
3884 mime_input_buf_unshift(c);
3889 mime_ungetc_buf(nkf_char c, FILE *f)
3892 (*i_mungetc_buf)(c,f);
3894 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
3899 mime_getc_buf(FILE *f)
3901 /* we don't keep eof of mime_input_buf, because it contains ?= as
3902 a terminator. It was checked in mime_integrity. */
3903 return ((mimebuf_f)?
3904 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
3908 switch_mime_getc(void)
3910 if (i_getc!=mime_getc) {
3911 i_mgetc = i_getc; i_getc = mime_getc;
3912 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3913 if(mime_f==STRICT_MIME) {
3914 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3915 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3921 unswitch_mime_getc(void)
3923 if(mime_f==STRICT_MIME) {
3924 i_mgetc = i_mgetc_buf;
3925 i_mungetc = i_mungetc_buf;
3928 i_ungetc = i_mungetc;
3929 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3930 mime_iconv_back = NULL;
3934 mime_integrity(FILE *f, const unsigned char *p)
3938 /* In buffered mode, read until =? or NL or buffer full
3940 mime_input_state.input = mime_input_state.top;
3941 mime_input_state.last = mime_input_state.top;
3943 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3945 q = mime_input_state.input;
3946 while((c=(*i_getc)(f))!=EOF) {
3947 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3948 break; /* buffer full */
3950 if (c=='=' && d=='?') {
3951 /* checked. skip header, start decode */
3952 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3953 /* mime_last_input = mime_input_state.input; */
3954 mime_input_state.input = q;
3958 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3960 /* Should we check length mod 4? */
3961 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3964 /* In case of Incomplete MIME, no MIME decode */
3965 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3966 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3967 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3968 switch_mime_getc(); /* anyway we need buffered getc */
3973 mime_begin_strict(FILE *f)
3977 const unsigned char *p,*q;
3978 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3980 mime_decode_mode = FALSE;
3981 /* =? has been checked */
3983 p = mime_pattern[j];
3986 for(i=2;p[i]>SP;i++) { /* start at =? */
3987 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3988 /* pattern fails, try next one */
3990 while (mime_pattern[++j]) {
3991 p = mime_pattern[j];
3992 for(k=2;k<i;k++) /* assume length(p) > i */
3993 if (p[k]!=q[k]) break;
3994 if (k==i && nkf_toupper(c1)==p[k]) break;
3996 p = mime_pattern[j];
3997 if (p) continue; /* found next one, continue */
3998 /* all fails, output from recovery buffer */
4006 mime_decode_mode = p[i-2];
4008 mime_iconv_back = iconv;
4009 set_iconv(FALSE, mime_priority_func[j]);
4010 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4012 if (mime_decode_mode=='B') {
4013 mimebuf_f = unbuf_f;
4015 /* do MIME integrity check */
4016 return mime_integrity(f,mime_pattern[j]);
4030 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4031 /* re-read and convert again from mime_buffer. */
4033 /* =? has been checked */
4034 k = mime_input_state.last;
4035 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4036 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4037 /* We accept any character type even if it is broken by new lines */
4038 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4039 if (c1==LF||c1==SP||c1==CR||
4040 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4042 /* Failed. But this could be another MIME preamble */
4044 mime_input_state.last--;
4050 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4051 if (!(++i<MAXRECOVER) || c1==EOF) break;
4052 if (c1=='b'||c1=='B') {
4053 mime_decode_mode = 'B';
4054 } else if (c1=='q'||c1=='Q') {
4055 mime_decode_mode = 'Q';
4059 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4060 if (!(++i<MAXRECOVER) || c1==EOF) break;
4062 mime_decode_mode = FALSE;
4068 if (!mime_decode_mode) {
4069 /* false MIME preamble, restart from mime_buffer */
4070 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4071 /* Since we are in MIME mode until buffer becomes empty, */
4072 /* we never go into mime_begin again for a while. */
4075 /* discard mime preamble, and goto MIME mode */
4076 mime_input_state.last = k;
4077 /* do no MIME integrity check */
4078 return c1; /* used only for checking EOF */
4089 debug(const char *str)
4092 fprintf(stderr, "%s\n", str ? str : "NULL");
4098 set_input_codename(const char *codename)
4100 if (!input_codename) {
4101 input_codename = codename;
4102 } else if (strcmp(codename, input_codename) != 0) {
4103 input_codename = "";
4108 get_guessed_code(void)
4110 if (input_codename && !*input_codename) {
4111 input_codename = "BINARY";
4113 struct input_code *p = find_inputcode_byfunc(iconv);
4114 if (!input_codename) {
4115 input_codename = "ASCII";
4116 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4117 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4118 input_codename = "CP932";
4119 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4120 if (p->score & (SCORE_X0212))
4121 input_codename = "EUCJP-MS";
4122 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4123 input_codename = "CP51932";
4124 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4125 if (p->score & (SCORE_KANA))
4126 input_codename = "CP50221";
4127 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4128 input_codename = "CP50220";
4131 return input_codename;
4134 #if !defined(PERL_XS) && !defined(WIN32DLL)
4136 print_guessed_code(char *filename)
4138 if (filename != NULL) printf("%s: ", filename);
4139 if (input_codename && !*input_codename) {
4142 input_codename = get_guessed_code();
4144 printf("%s\n", input_codename);
4148 input_eol == CR ? " (CR)" :
4149 input_eol == LF ? " (LF)" :
4150 input_eol == CRLF ? " (CRLF)" :
4151 input_eol == EOF ? " (MIXED NL)" :
4161 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4163 nkf_char c1, c2, c3;
4169 if (!nkf_isxdigit(c2)){
4174 if (!nkf_isxdigit(c3)){
4179 return (hex2bin(c2) << 4) | hex2bin(c3);
4185 return hex_getc(':', f, i_cgetc, i_cungetc);
4189 cap_ungetc(nkf_char c, FILE *f)
4191 return (*i_cungetc)(c, f);
4197 return hex_getc('%', f, i_ugetc, i_uungetc);
4201 url_ungetc(nkf_char c, FILE *f)
4203 return (*i_uungetc)(c, f);
4207 #ifdef NUMCHAR_OPTION
4209 numchar_getc(FILE *f)
4211 nkf_char (*g)(FILE *) = i_ngetc;
4212 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4223 if (buf[i] == 'x' || buf[i] == 'X'){
4224 for (j = 0; j < 7; j++){
4226 if (!nkf_isxdigit(buf[i])){
4233 c |= hex2bin(buf[i]);
4236 for (j = 0; j < 8; j++){
4240 if (!nkf_isdigit(buf[i])){
4247 c += hex2bin(buf[i]);
4253 return nkf_char_unicode_new(c);
4263 numchar_ungetc(nkf_char c, FILE *f)
4265 return (*i_nungetc)(c, f);
4269 #ifdef UNICODE_NORMALIZATION
4274 nkf_char (*g)(FILE *f) = i_nfc_getc;
4275 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4276 nkf_buf_t *buf = nkf_state->nfc_buf;
4277 const unsigned char *array;
4278 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4279 nkf_char c = (*g)(f);
4281 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4283 nkf_buf_push(buf, c);
4285 while (lower <= upper) {
4286 int mid = (lower+upper) / 2;
4288 array = normalization_table[mid].nfd;
4289 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4290 if (len >= nkf_buf_length(buf)) {
4294 lower = 1, upper = 0;
4297 nkf_buf_push(buf, c);
4299 if (array[len] != nkf_buf_at(buf, len)) {
4300 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4301 else upper = mid - 1;
4308 array = normalization_table[mid].nfc;
4310 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4311 nkf_buf_push(buf, array[i]);
4315 } while (lower <= upper);
4317 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4318 c = nkf_buf_pop(buf);
4324 nfc_ungetc(nkf_char c, FILE *f)
4326 return (*i_nfc_ungetc)(c, f);
4328 #endif /* UNICODE_NORMALIZATION */
4332 base64decode(nkf_char c)
4337 i = c - 'A'; /* A..Z 0-25 */
4338 } else if (c == '_') {
4339 i = '?' /* 63 */ ; /* _ 63 */
4341 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4343 } else if (c > '/') {
4344 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4345 } else if (c == '+' || c == '-') {
4346 i = '>' /* 62 */ ; /* + and - 62 */
4348 i = '?' /* 63 */ ; /* / 63 */
4356 nkf_char c1, c2, c3, c4, cc;
4357 nkf_char t1, t2, t3, t4, mode, exit_mode;
4358 nkf_char lwsp_count;
4361 nkf_char lwsp_size = 128;
4363 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4364 return mime_input_buf(mime_input_state.top++);
4366 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4367 mime_decode_mode=FALSE;
4368 unswitch_mime_getc();
4369 return (*i_getc)(f);
4372 if (mimebuf_f == FIXED_MIME)
4373 exit_mode = mime_decode_mode;
4376 if (mime_decode_mode == 'Q') {
4377 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4379 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4380 if (c1<=SP || DEL<=c1) {
4381 mime_decode_mode = exit_mode; /* prepare for quit */
4384 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4388 mime_decode_mode = exit_mode; /* prepare for quit */
4389 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4390 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4391 /* end Q encoding */
4392 input_mode = exit_mode;
4394 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4395 while ((c1=(*i_getc)(f))!=EOF) {
4400 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4408 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4409 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4424 lwsp_buf[lwsp_count] = (unsigned char)c1;
4425 if (lwsp_count++>lwsp_size){
4427 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4428 lwsp_buf = lwsp_buf_new;
4434 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4436 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4437 i_ungetc(lwsp_buf[lwsp_count],f);
4440 nkf_xfree(lwsp_buf);
4443 if (c1=='='&&c2<SP) { /* this is soft wrap */
4444 while((c1 = (*i_mgetc)(f)) <=SP) {
4445 if (c1 == EOF) return (EOF);
4447 mime_decode_mode = 'Q'; /* still in MIME */
4448 goto restart_mime_q;
4451 mime_decode_mode = 'Q'; /* still in MIME */
4455 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4456 if (c2<=SP) return c2;
4457 mime_decode_mode = 'Q'; /* still in MIME */
4458 return ((hex2bin(c2)<<4) + hex2bin(c3));
4461 if (mime_decode_mode != 'B') {
4462 mime_decode_mode = FALSE;
4463 return (*i_mgetc)(f);
4467 /* Base64 encoding */
4469 MIME allows line break in the middle of
4470 Base64, but we are very pessimistic in decoding
4471 in unbuf mode because MIME encoded code may be broken by
4472 less or an editor's control sequence (such as ESC-[-K) in unbuffered
4473 mode. Ignore incomplete MIME.
4475 mode = mime_decode_mode;
4476 mime_decode_mode = exit_mode; /* prepare for quit */
4478 while ((c1 = (*i_mgetc)(f))<=SP) {
4483 if ((c2 = (*i_mgetc)(f))<=SP) {
4486 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4487 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4490 if ((c1 == '?') && (c2 == '=')) {
4493 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4494 while ((c1=(*i_getc)(f))!=EOF) {
4499 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4507 if ((c1=(*i_getc)(f))!=EOF) {
4511 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4526 lwsp_buf[lwsp_count] = (unsigned char)c1;
4527 if (lwsp_count++>lwsp_size){
4529 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4530 lwsp_buf = lwsp_buf_new;
4536 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4538 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4539 i_ungetc(lwsp_buf[lwsp_count],f);
4542 nkf_xfree(lwsp_buf);
4546 if ((c3 = (*i_mgetc)(f))<=SP) {
4549 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4550 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4554 if ((c4 = (*i_mgetc)(f))<=SP) {
4557 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4558 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4562 mime_decode_mode = mode; /* still in MIME sigh... */
4564 /* BASE 64 decoding */
4566 t1 = 0x3f & base64decode(c1);
4567 t2 = 0x3f & base64decode(c2);
4568 t3 = 0x3f & base64decode(c3);
4569 t4 = 0x3f & base64decode(c4);
4570 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4572 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4573 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4575 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4576 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4578 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4583 return mime_input_buf(mime_input_state.top++);
4586 static const char basis_64[] =
4587 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4589 #define MIMEOUT_BUF_LENGTH 74
4591 char buf[MIMEOUT_BUF_LENGTH+1];
4595 /*nkf_char mime_lastchar2, mime_lastchar1;*/
4598 open_mime(nkf_char mode)
4600 const unsigned char *p;
4603 p = mime_pattern[0];
4604 for(i=0;mime_pattern[i];i++) {
4605 if (mode == mime_encode[i]) {
4606 p = mime_pattern[i];
4610 mimeout_mode = mime_encode_method[i];
4612 if (base64_count>45) {
4613 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4614 (*o_mputc)(mimeout_state.buf[i]);
4617 PUT_NEWLINE((*o_mputc));
4620 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
4624 for (;i<mimeout_state.count;i++) {
4625 if (nkf_isspace(mimeout_state.buf[i])) {
4626 (*o_mputc)(mimeout_state.buf[i]);
4636 j = mimeout_state.count;
4637 mimeout_state.count = 0;
4639 mime_putc(mimeout_state.buf[i]);
4644 mime_prechar(nkf_char c2, nkf_char c1)
4646 if (mimeout_mode > 0){
4648 if (base64_count + mimeout_state.count/3*4> 73){
4649 (*o_base64conv)(EOF,0);
4650 OCONV_NEWLINE((*o_base64conv));
4651 (*o_base64conv)(0,SP);
4655 if (!(c2 == 0 && (c1 == CR || c1 == LF)) &&
4656 base64_count + mimeout_state.count/3*4> 66) {
4657 (*o_base64conv)(EOF,0);
4658 OCONV_NEWLINE((*o_base64conv));
4659 (*o_base64conv)(0,SP);
4665 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4666 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4667 open_mime(output_mode);
4668 (*o_base64conv)(EOF,0);
4669 OCONV_NEWLINE((*o_base64conv));
4670 (*o_base64conv)(0,SP);
4689 switch(mimeout_mode) {
4694 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
4700 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
4705 if (mimeout_mode > 0) {
4706 if (mimeout_f!=FIXED_MIME) {
4708 } else if (mimeout_mode != 'Q')
4714 mimeout_addchar(nkf_char c)
4716 switch(mimeout_mode) {
4721 } else if(!nkf_isalnum(c)) {
4723 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4724 (*o_mputc)(bin2hex((c&0xf)));
4732 nkf_state->mimeout_state=c;
4733 (*o_mputc)(basis_64[c>>2]);
4738 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4739 nkf_state->mimeout_state=c;
4744 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4745 (*o_mputc)(basis_64[c & 0x3F]);
4757 mime_putc(nkf_char c)
4762 if (mimeout_f == FIXED_MIME){
4763 if (mimeout_mode == 'Q'){
4764 if (base64_count > 71){
4765 if (c!=CR && c!=LF) {
4767 PUT_NEWLINE((*o_mputc));
4772 if (base64_count > 71){
4774 PUT_NEWLINE((*o_mputc));
4777 if (c == EOF) { /* c==EOF */
4781 if (c != EOF) { /* c!=EOF */
4787 /* mimeout_f != FIXED_MIME */
4789 if (c == EOF) { /* c==EOF */
4790 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4791 j = mimeout_state.count;
4792 mimeout_state.count = 0;
4794 if (mimeout_mode > 0) {
4795 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4797 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4800 mimeout_addchar(mimeout_state.buf[i]);
4804 mimeout_addchar(mimeout_state.buf[i]);
4808 mimeout_addchar(mimeout_state.buf[i]);
4814 mimeout_addchar(mimeout_state.buf[i]);
4820 if (mimeout_state.count > 0){
4821 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4826 if (mimeout_mode=='Q') {
4827 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4828 if (c == CR || c == LF) {
4833 } else if (c <= SP) {
4835 if (base64_count > 70) {
4836 PUT_NEWLINE((*o_mputc));
4839 if (!nkf_isblank(c)) {
4844 if (base64_count > 70) {
4846 PUT_NEWLINE((*o_mputc));
4849 open_mime(output_mode);
4851 if (!nkf_noescape_mime(c)) {
4864 if (mimeout_mode <= 0) {
4865 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
4866 output_mode == UTF_8)) {
4867 if (nkf_isspace(c)) {
4869 if (mimeout_mode == -1) {
4872 if (c==CR || c==LF) {
4874 open_mime(output_mode);
4880 for (i=0;i<mimeout_state.count;i++) {
4881 (*o_mputc)(mimeout_state.buf[i]);
4882 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4893 mimeout_state.buf[0] = (char)c;
4894 mimeout_state.count = 1;
4896 if (base64_count > 1
4897 && base64_count + mimeout_state.count > 76
4898 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4899 static const char *str = "boundary=\"";
4900 static int len = 10;
4903 for (; i < mimeout_state.count - len; ++i) {
4904 if (!strncmp(mimeout_state.buf+i, str, len)) {
4910 if (i == 0 || i == mimeout_state.count - len) {
4911 PUT_NEWLINE((*o_mputc));
4913 if (!nkf_isspace(mimeout_state.buf[0])){
4920 for (j = 0; j <= i; ++j) {
4921 (*o_mputc)(mimeout_state.buf[j]);
4923 PUT_NEWLINE((*o_mputc));
4925 for (; j <= mimeout_state.count; ++j) {
4926 mimeout_state.buf[j - i] = mimeout_state.buf[j];
4928 mimeout_state.count -= i;
4931 mimeout_state.buf[mimeout_state.count++] = (char)c;
4932 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4933 open_mime(output_mode);
4938 if (lastchar==CR || lastchar == LF){
4939 for (i=0;i<mimeout_state.count;i++) {
4940 (*o_mputc)(mimeout_state.buf[i]);
4943 mimeout_state.count = 0;
4946 for (i=0;i<mimeout_state.count-1;i++) {
4947 (*o_mputc)(mimeout_state.buf[i]);
4950 mimeout_state.buf[0] = SP;
4951 mimeout_state.count = 1;
4953 open_mime(output_mode);
4956 /* mimeout_mode == 'B', 1, 2 */
4957 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
4958 output_mode == UTF_8)) {
4959 if (lastchar == CR || lastchar == LF){
4960 if (nkf_isblank(c)) {
4961 for (i=0;i<mimeout_state.count;i++) {
4962 mimeout_addchar(mimeout_state.buf[i]);
4964 mimeout_state.count = 0;
4967 for (i=0;i<mimeout_state.count;i++) {
4968 (*o_mputc)(mimeout_state.buf[i]);
4971 mimeout_state.count = 0;
4973 mimeout_state.buf[mimeout_state.count++] = (char)c;
4976 if (nkf_isspace(c)) {
4977 for (i=0;i<mimeout_state.count;i++) {
4978 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4980 for (i=0;i<mimeout_state.count;i++) {
4981 (*o_mputc)(mimeout_state.buf[i]);
4984 mimeout_state.count = 0;
4987 mimeout_state.buf[mimeout_state.count++] = (char)c;
4988 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4990 for (i=0;i<mimeout_state.count;i++) {
4991 (*o_mputc)(mimeout_state.buf[i]);
4994 mimeout_state.count = 0;
4998 if (mimeout_state.count>0 && SP<c && c!='=') {
4999 mimeout_state.buf[mimeout_state.count++] = (char)c;
5000 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5001 j = mimeout_state.count;
5002 mimeout_state.count = 0;
5004 mimeout_addchar(mimeout_state.buf[i]);
5011 if (mimeout_state.count>0) {
5012 j = mimeout_state.count;
5013 mimeout_state.count = 0;
5015 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5017 mimeout_addchar(mimeout_state.buf[i]);
5023 (*o_mputc)(mimeout_state.buf[i]);
5025 open_mime(output_mode);
5032 base64_conv(nkf_char c2, nkf_char c1)
5034 mime_prechar(c2, c1);
5035 (*o_base64conv)(c2,c1);
5039 typedef struct nkf_iconv_t {
5042 size_t input_buffer_size;
5043 char *output_buffer;
5044 size_t output_buffer_size;
5048 nkf_iconv_new(char *tocode, char *fromcode)
5050 nkf_iconv_t converter;
5052 converter->input_buffer_size = IOBUF_SIZE;
5053 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5054 converter->output_buffer_size = IOBUF_SIZE * 2;
5055 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5056 converter->cd = iconv_open(tocode, fromcode);
5057 if (converter->cd == (iconv_t)-1)
5061 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5064 perror("can't iconv_open");
5070 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5072 size_t invalid = (size_t)0;
5073 char *input_buffer = converter->input_buffer;
5074 size_t input_length = (size_t)0;
5075 char *output_buffer = converter->output_buffer;
5076 size_t output_length = converter->output_buffer_size;
5081 while ((c = (*i_getc)(f)) != EOF) {
5082 input_buffer[input_length++] = c;
5083 if (input_length < converter->input_buffer_size) break;
5087 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5088 while (output_length-- > 0) {
5089 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5091 if (ret == (size_t) - 1) {
5094 if (input_buffer != converter->input_buffer)
5095 memmove(converter->input_buffer, input_buffer, input_length);
5098 converter->output_buffer_size *= 2;
5099 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5100 if (output_buffer == NULL) {
5101 perror("can't realloc");
5104 converter->output_buffer = output_buffer;
5107 perror("can't iconv");
5120 nkf_iconv_close(nkf_iconv_t *convert)
5122 nkf_xfree(converter->inbuf);
5123 nkf_xfree(converter->outbuf);
5124 iconv_close(converter->cd);
5133 struct input_code *p = input_code_list;
5145 mime_f = MIME_DECODE_DEFAULT;
5146 mime_decode_f = FALSE;
5151 x0201_f = X0201_DEFAULT;
5152 iso2022jp_f = FALSE;
5153 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5154 ms_ucs_map_f = UCS_MAP_ASCII;
5156 #ifdef UTF8_INPUT_ENABLE
5157 no_cp932ext_f = FALSE;
5158 no_best_fit_chars_f = FALSE;
5159 encode_fallback = NULL;
5160 unicode_subchar = '?';
5161 input_endian = ENDIAN_BIG;
5163 #ifdef UTF8_OUTPUT_ENABLE
5164 output_bom_f = FALSE;
5165 output_endian = ENDIAN_BIG;
5167 #ifdef UNICODE_NORMALIZATION
5183 #ifdef SHIFTJIS_CP932
5193 for (i = 0; i < 256; i++){
5194 prefix_table[i] = 0;
5198 mimeout_state.count = 0;
5203 fold_preserve_f = FALSE;
5206 kanji_intro = DEFAULT_J;
5207 ascii_intro = DEFAULT_R;
5208 fold_margin = FOLD_MARGIN;
5209 o_zconv = no_connection;
5210 o_fconv = no_connection;
5211 o_eol_conv = no_connection;
5212 o_rot_conv = no_connection;
5213 o_hira_conv = no_connection;
5214 o_base64conv = no_connection;
5215 o_iso2022jp_check_conv = no_connection;
5218 i_ungetc = std_ungetc;
5220 i_bungetc = std_ungetc;
5223 i_mungetc = std_ungetc;
5224 i_mgetc_buf = std_getc;
5225 i_mungetc_buf = std_ungetc;
5226 output_mode = ASCII;
5228 mime_decode_mode = FALSE;
5234 z_prev2=0,z_prev1=0;
5236 iconv_for_check = 0;
5238 input_codename = NULL;
5239 input_encoding = NULL;
5240 output_encoding = NULL;
5248 module_connection(void)
5250 if (input_encoding) set_input_encoding(input_encoding);
5251 if (!output_encoding) {
5252 output_encoding = nkf_default_encoding();
5254 if (!output_encoding) {
5255 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5258 set_output_encoding(output_encoding);
5259 oconv = nkf_enc_to_oconv(output_encoding);
5261 if (nkf_enc_unicode_p(output_encoding))
5262 output_mode = UTF_8;
5264 /* replace continuation module, from output side */
5266 /* output redirection */
5268 if (noout_f || guess_f){
5275 if (mimeout_f == TRUE) {
5276 o_base64conv = oconv; oconv = base64_conv;
5278 /* base64_count = 0; */
5281 if (eolmode_f || guess_f) {
5282 o_eol_conv = oconv; oconv = eol_conv;
5285 o_rot_conv = oconv; oconv = rot_conv;
5288 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5291 o_hira_conv = oconv; oconv = hira_conv;
5294 o_fconv = oconv; oconv = fold_conv;
5297 if (alpha_f || x0201_f) {
5298 o_zconv = oconv; oconv = z_conv;
5302 i_ungetc = std_ungetc;
5303 /* input redirection */
5306 i_cgetc = i_getc; i_getc = cap_getc;
5307 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5310 i_ugetc = i_getc; i_getc = url_getc;
5311 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5314 #ifdef NUMCHAR_OPTION
5316 i_ngetc = i_getc; i_getc = numchar_getc;
5317 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5320 #ifdef UNICODE_NORMALIZATION
5322 i_nfc_getc = i_getc; i_getc = nfc_getc;
5323 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5326 if (mime_f && mimebuf_f==FIXED_MIME) {
5327 i_mgetc = i_getc; i_getc = mime_getc;
5328 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5331 i_bgetc = i_getc; i_getc = broken_getc;
5332 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5334 if (input_encoding) {
5335 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5337 set_iconv(FALSE, e_iconv);
5341 struct input_code *p = input_code_list;
5350 Conversion main loop. Code detection only.
5353 #if !defined(PERL_XS) && !defined(WIN32DLL)
5360 module_connection();
/* Read characters through the installed i_getc chain until EOF. */
5361 while ((c = (*i_getc)(f)) != EOF)
/*
 * Control-flow shorthands.  NEXT, SKIP, MORE and LAST expand to
 * continue/break and therefore only make sense inside a loop.  SKIP clears
 * c2 and MORE copies c1 into c2 before continuing, so both rely on variables
 * named c1/c2 being in scope.  SKIP and MORE expand to two statements and
 * are not wrapped in do { } while (0).
 *
 * set_input_mode(mode) stores mode in input_mode and records and
 * debug-logs the input code name "ISO-2022-JP".
 */
5368 #define NEXT continue /* no output, get next */
5369 #define SKIP c2=0;continue /* no output, get next */
5370 #define MORE c2=c1;continue /* need one more byte */
5371 #define SEND ; /* output c1 and c2, get next */
5372 #define LAST break /* end of loop, go closing */
5373 #define set_input_mode(mode) do { \
5374 input_mode = mode; \
5376 set_input_codename("ISO-2022-JP"); \
5377 debug("ISO-2022-JP"); \
/*
 * kanji_convert: main conversion loop.
 *
 *   f  input stream; it is only accessed through (*i_getc) and (*i_ungetc).
 *
 * After module_connection() has set up the filter chains, the input is read
 * in one of three ways depending on the installed iconv:
 *   - w_iconv32: four bytes at a time, passed to nkf_iconv_utf_32();
 *   - w_iconv16: two bytes at a time, with two more bytes fetched when
 *     nkf_iconv_utf_16() returns -2;
 *   - otherwise: byte by byte, handling 8-bit bytes, SI/SO, ESC sequences,
 *     MIME starts ("=?") and line ends, and passing the result on through
 *     (*iconv) or (*oconv).
 *
 * A negative result from module_connection() is reported as
 * "no output encoding given".
 */
5381 kanji_convert(FILE *f)
5383 nkf_char c1=0, c2=0, c3=0, c4=0;
5384 int shift_mode = 0; /* 0, 1, 2, 3 */
5386 int is_8bit = FALSE;
5388 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5393 output_mode = ASCII;
5395 if (module_connection() < 0) {
5396 #if !defined(PERL_XS) && !defined(WIN32DLL)
5397 fprintf(stderr, "no output encoding given\n");
5403 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: the loop stops as soon as any of the four reads hits EOF. */
5404 if(iconv == w_iconv32){
5405 while ((c1 = (*i_getc)(f)) != EOF &&
5406 (c2 = (*i_getc)(f)) != EOF &&
5407 (c3 = (*i_getc)(f)) != EOF &&
5408 (c4 = (*i_getc)(f)) != EOF) {
5409 nkf_iconv_utf_32(c1, c2, c3, c4);
5411 (*i_ungetc)(EOF, f);
5413 else if (iconv == w_iconv16) {
5414 while ((c1 = (*i_getc)(f)) != EOF &&
5415 (c2 = (*i_getc)(f)) != EOF) {
/* A -2 result asks for two more bytes; all four are then converted together. */
5416 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5417 (c3 = (*i_getc)(f)) != EOF &&
5418 (c4 = (*i_getc)(f)) != EOF) {
5419 nkf_iconv_utf_16(c1, c2, c3, c4);
5422 (*i_ungetc)(EOF, f);
5426 while ((c1 = (*i_getc)(f)) != EOF) {
5427 #ifdef INPUT_CODE_FIX
5428 if (!input_encoding)
5434 /* in case of 8th bit is on */
5435 if (!estab_f&&!mime_decode_mode) {
5436 /* in case of not established yet */
5437 /* It is still ambiguous */
5438 if (h_conv(f, c2, c1)==EOF) {
5446 /* in case of already established */
5448 /* ignore bogus code */
5456 /* 2nd byte of 7 bit code or SJIS */
5460 else if (nkf_char_unicode_p(c1)) {
5466 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5469 }else if (input_codename && input_codename[0] == 'I' &&
5470 0xA1 <= c1 && c1 <= 0xDF) {
5471 /* JIS X 0201 Katakana in 8bit JIS */
5472 c2 = JIS_X_0201_1976_K;
5475 } else if (c1 > DEL) {
5477 if (!estab_f && !iso8859_f) {
5478 /* not established yet */
5480 } else { /* estab_f==TRUE */
5486 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5487 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5489 c2 = JIS_X_0201_1976_K;
5494 /* already established */
5498 } else if (SP < c1 && c1 < DEL) {
5499 /* in case of Roman characters */
5501 /* output 1 shifted byte */
5505 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5506 /* output 1 shifted byte */
5507 c2 = JIS_X_0201_1976_K;
5510 /* look like bogus code */
5513 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5514 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5515 /* in case of Kanji shifted */
5517 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5518 /* Check MIME code */
5519 if ((c1 = (*i_getc)(f)) == EOF) {
5522 } else if (c1 == '?') {
5523 /* =? is mime conversion start sequence */
5524 if(mime_f == STRICT_MIME) {
5525 /* check in real detail */
5526 if (mime_begin_strict(f) == EOF)
5529 } else if (mime_begin(f) == EOF)
5538 /* normal ASCII code */
5541 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5544 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5547 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5548 if ((c1 = (*i_getc)(f)) == EOF) {
5552 else if (c1 == '&') {
5554 if ((c1 = (*i_getc)(f)) == EOF) {
5560 else if (c1 == '$') {
5562 if ((c1 = (*i_getc)(f)) == EOF) {
5563 /* don't send bogus code
5565 (*oconv)(0, '$'); */
5567 } else if (c1 == '@' || c1 == 'B') {
5569 set_input_mode(JIS_X_0208);
5571 } else if (c1 == '(') {
5573 if ((c1 = (*i_getc)(f)) == EOF) {
5574 /* don't send bogus code
5580 } else if (c1 == '@'|| c1 == 'B') {
5582 set_input_mode(JIS_X_0208);
5585 } else if (c1 == 'D'){
5586 set_input_mode(JIS_X_0212);
5588 #endif /* X0212_ENABLE */
5589 } else if (c1 == 'O' || c1 == 'Q'){
5590 set_input_mode(JIS_X_0213_1);
5592 } else if (c1 == 'P'){
5593 set_input_mode(JIS_X_0213_2);
5596 /* could be some special code */
5603 } else if (broken_f&0x2) {
5604 /* accept any ESC-(-x as broken code ... */
5605 input_mode = JIS_X_0208;
5614 } else if (c1 == '(') {
5616 if ((c1 = (*i_getc)(f)) == EOF) {
5617 /* don't send bogus code
5619 (*oconv)(0, '('); */
5622 else if (c1 == 'I') {
5623 /* JIS X 0201 Katakana */
5624 set_input_mode(JIS_X_0201_1976_K);
5627 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5628 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5629 set_input_mode(ASCII);
5632 else if (broken_f&0x2) {
5633 set_input_mode(ASCII);
5642 else if (c1 == '.') {
5644 if ((c1 = (*i_getc)(f)) == EOF) {
5647 else if (c1 == 'A') {
5658 else if (c1 == 'N') {
5661 if (g2 == ISO_8859_1) {
5676 } else if (c1 == ESC && iconv == s_iconv) {
5677 /* ESC in Shift_JIS */
5678 if ((c1 = (*i_getc)(f)) == EOF) {
5681 } else if (c1 == '$') {
5683 if ((c1 = (*i_getc)(f)) == EOF) {
5685 } else if (('E' <= c1 && c1 <= 'G') ||
5686 ('O' <= c1 && c1 <= 'Q')) {
/*
 * c1 % 7 picks the base value for this escape letter; each following byte
 * in SP..'z' is then emitted as that base plus the byte.
 */
5694 static const nkf_char jphone_emoji_first_table[7] =
5695 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5696 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5697 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5698 while (SP <= c1 && c1 <= 'z') {
5699 (*oconv)(0, c1 + c3);
5700 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5715 } else if (c1 == LF || c1 == CR) {
5717 input_mode = ASCII; set_iconv(FALSE, 0);
5719 } else if (mime_decode_f && !mime_decode_mode){
5721 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5729 } else { /* if (c1 == CR)*/
5730 if ((c1=(*i_getc)(f))!=EOF) {
5734 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5754 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5757 if ((c3 = (*i_getc)(f)) != EOF) {
5760 if ((c4 = (*i_getc)(f)) != EOF) {
5762 (*iconv)(c2, c1, c3|c4);
5767 /* 3 bytes EUC or UTF-8 */
5768 if ((c3 = (*i_getc)(f)) != EOF) {
5770 (*iconv)(c2, c1, c3);
5778 0x7F <= c2 && c2 <= 0x92 &&
5779 0x21 <= c1 && c1 <= 0x7E) {
5781 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5784 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5788 (*oconv)(PREFIX_EUCG3 | c2, c1);
5790 #endif /* X0212_ENABLE */
5792 (*oconv)(PREFIX_EUCG3 | c2, c1);
5795 (*oconv)(input_mode, c1); /* other special case */
5801 /* goto next_word */
5805 (*iconv)(EOF, 0, 0);
/*
 * No input code name was recorded: fall back to the input_code_list entry
 * with the lowest score.
 */
5806 if (!input_codename)
5809 struct input_code *p = input_code_list;
5810 struct input_code *result = p;
5812 if (p->score < result->score) result = p;
5815 set_input_codename(result->name);
5817 debug(result->name);
5825 * int options(unsigned char *cp)
/*
 * options: parse one option string and set the corresponding global flags
 * and encodings.
 *
 *   cp  NUL-terminated option string; parsing starts after the first '-'.
 *
 * A second '-' introduces a long option, which is looked up in
 * long_option[].  An entry with a non-empty alias is re-parsed through its
 * alias string; the remaining entries are handled by name, with p pointing
 * at the text used as the option's value.  Everything else is handled as
 * single-letter options, several of which consume the characters that
 * follow them (for example "-w16L", "-f60", "-Z1").
 */
5832 options(unsigned char *cp)
5836 unsigned char *cp_back = NULL;
5841 while(*cp && *cp++!='-');
5842 while (*cp || cp_back) {
5850 case '-': /* literal options */
5851 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
5855 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5856 p = (unsigned char *)long_option[i].name;
5857 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5858 if (*p == cp[j] || cp[j] == SP){
5865 #if !defined(PERL_XS) && !defined(WIN32DLL)
5866 fprintf(stderr, "unknown long option: --%s\n", cp);
5870 while(*cp && *cp != SP && cp++);
5871 if (long_option[i].alias[0]){
5873 cp = (unsigned char *)long_option[i].alias;
5876 if (strcmp(long_option[i].name, "help") == 0){
5881 if (strcmp(long_option[i].name, "ic=") == 0){
5882 enc = nkf_enc_find((char *)p);
5884 input_encoding = enc;
5887 if (strcmp(long_option[i].name, "oc=") == 0){
5888 enc = nkf_enc_find((char *)p);
5889 /* if (enc <= 0) continue; */
5891 output_encoding = enc;
5894 if (strcmp(long_option[i].name, "guess=") == 0){
5895 if (p[0] == '0' || p[0] == '1') {
5903 if (strcmp(long_option[i].name, "overwrite") == 0){
5906 preserve_time_f = TRUE;
5909 if (strcmp(long_option[i].name, "overwrite=") == 0){
5912 preserve_time_f = TRUE;
5914 backup_suffix = (char *)p;
5917 if (strcmp(long_option[i].name, "in-place") == 0){
5920 preserve_time_f = FALSE;
5923 if (strcmp(long_option[i].name, "in-place=") == 0){
5926 preserve_time_f = FALSE;
5928 backup_suffix = (char *)p;
5933 if (strcmp(long_option[i].name, "cap-input") == 0){
5937 if (strcmp(long_option[i].name, "url-input") == 0){
5942 #ifdef NUMCHAR_OPTION
5943 if (strcmp(long_option[i].name, "numchar-input") == 0){
5949 if (strcmp(long_option[i].name, "no-output") == 0){
5953 if (strcmp(long_option[i].name, "debug") == 0){
5958 if (strcmp(long_option[i].name, "cp932") == 0){
5959 #ifdef SHIFTJIS_CP932
5963 #ifdef UTF8_OUTPUT_ENABLE
5964 ms_ucs_map_f = UCS_MAP_CP932;
5968 if (strcmp(long_option[i].name, "no-cp932") == 0){
5969 #ifdef SHIFTJIS_CP932
5973 #ifdef UTF8_OUTPUT_ENABLE
5974 ms_ucs_map_f = UCS_MAP_ASCII;
5978 #ifdef SHIFTJIS_CP932
5979 if (strcmp(long_option[i].name, "cp932inv") == 0){
5986 if (strcmp(long_option[i].name, "x0212") == 0){
5993 if (strcmp(long_option[i].name, "exec-in") == 0){
5997 if (strcmp(long_option[i].name, "exec-out") == 0){
6002 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
6003 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
6004 no_cp932ext_f = TRUE;
6007 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
6008 no_best_fit_chars_f = TRUE;
6011 if (strcmp(long_option[i].name, "fb-skip") == 0){
6012 encode_fallback = NULL;
6015 if (strcmp(long_option[i].name, "fb-html") == 0){
6016 encode_fallback = encode_fallback_html;
6019 if (strcmp(long_option[i].name, "fb-xml") == 0){
6020 encode_fallback = encode_fallback_xml;
6023 if (strcmp(long_option[i].name, "fb-java") == 0){
6024 encode_fallback = encode_fallback_java;
6027 if (strcmp(long_option[i].name, "fb-perl") == 0){
6028 encode_fallback = encode_fallback_perl;
6031 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6032 encode_fallback = encode_fallback_subchar;
6035 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6036 encode_fallback = encode_fallback_subchar;
6037 unicode_subchar = 0;
6039 /* decimal number */
/*
 * NOTE(review): i, which indexed long_option[] up to here, is reused below
 * as the digit cursor and as an output of w16e_conv(); long_option[i] must
 * not be consulted again afterwards.
 */
6040 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6041 unicode_subchar *= 10;
6042 unicode_subchar += hex2bin(p[i]);
6044 }else if(p[1] == 'x' || p[1] == 'X'){
6045 /* hexadecimal number */
6046 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6047 unicode_subchar <<= 4;
6048 unicode_subchar |= hex2bin(p[i]);
6052 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6053 unicode_subchar *= 8;
6054 unicode_subchar += hex2bin(p[i]);
6057 w16e_conv(unicode_subchar, &i, &j);
6058 unicode_subchar = i<<8 | j;
6062 #ifdef UTF8_OUTPUT_ENABLE
6063 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6064 ms_ucs_map_f = UCS_MAP_MS;
6068 #ifdef UNICODE_NORMALIZATION
6069 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
/*
 * --prefix=: p[0] is the prefix character; every graphic character that
 * follows it is mapped to p[0] in prefix_table.
 */
6074 if (strcmp(long_option[i].name, "prefix=") == 0){
6075 if (nkf_isgraph(p[0])){
6076 for (i = 1; nkf_isgraph(p[i]); i++){
6077 prefix_table[p[i]] = p[0];
6082 #if !defined(PERL_XS) && !defined(WIN32DLL)
6083 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6088 case 'b': /* buffered mode */
6091 case 'u': /* non buffered mode */
6094 case 't': /* transparent mode */
6099 } else if (*cp=='2') {
6103 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6111 case 'j': /* JIS output */
6113 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6115 case 'e': /* AT&T EUC output */
6116 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6118 case 's': /* SJIS output */
6119 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6121 case 'l': /* ISO8859 Latin-1 support, no conversion */
6122 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6123 input_encoding = nkf_enc_from_index(ISO_8859_1);
6125 case 'i': /* Kanji IN ESC-$-@/B */
6126 if (*cp=='@'||*cp=='B')
6127 kanji_intro = *cp++;
6129 case 'o': /* ASCII IN ESC-(-J/B/H */
6130 /* ESC ( H was used in initial JUNET messages */
6131 if (*cp=='J'||*cp=='B'||*cp=='H')
6132 ascii_intro = *cp++;
6136 bit:1 katakana->hiragana
6137 bit:2 hiragana->katakana
6139 if ('9'>= *cp && *cp>='0')
6140 hira_f |= (*cp++ -'0');
6147 #if defined(MSDOS) || defined(__OS2__)
6154 show_configuration();
6162 #ifdef UTF8_OUTPUT_ENABLE
6163 case 'w': /* UTF-8 output */
6168 output_encoding = nkf_enc_from_index(UTF_8N);
6170 output_bom_f = TRUE;
6171 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6175 if ('1'== cp[0] && '6'==cp[1]) {
6178 } else if ('3'== cp[0] && '2'==cp[1]) {
6182 output_encoding = nkf_enc_from_index(UTF_8);
6187 output_endian = ENDIAN_LITTLE;
6188 } else if (cp[0] == 'B') {
6193 enc_idx = enc_idx == UTF_16
6194 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6195 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6197 output_bom_f = TRUE;
6198 enc_idx = enc_idx == UTF_16
6199 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6200 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6202 output_encoding = nkf_enc_from_index(enc_idx);
6206 #ifdef UTF8_INPUT_ENABLE
6207 case 'W': /* UTF input */
6210 input_encoding = nkf_enc_from_index(UTF_8);
6213 if ('1'== cp[0] && '6'==cp[1]) {
6215 input_endian = ENDIAN_BIG;
6217 } else if ('3'== cp[0] && '2'==cp[1]) {
6219 input_endian = ENDIAN_BIG;
6222 input_encoding = nkf_enc_from_index(UTF_8);
6227 input_endian = ENDIAN_LITTLE;
6228 } else if (cp[0] == 'B') {
6230 input_endian = ENDIAN_BIG;
6232 enc_idx = (enc_idx == UTF_16
6233 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6234 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6235 input_encoding = nkf_enc_from_index(enc_idx);
6239 /* Input code assumption */
6240 case 'J': /* ISO-2022-JP input */
6241 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6243 case 'E': /* EUC-JP input */
6244 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6246 case 'S': /* Shift_JIS input */
6247 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6249 case 'Z': /* Convert X0208 alphabet to ascii */
6251 bit:0 Convert JIS X 0208 Alphabet to ASCII
6252 bit:1 Convert Kankaku to one space
6253 bit:2 Convert Kankaku to two spaces
6254 bit:3 Convert HTML Entity
6255 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6257 while ('0'<= *cp && *cp <='4') {
6258 alpha_f |= 1 << (*cp++ - '0');
6262 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6263 x0201_f = FALSE; /* No X0201->X0208 conversion */
6265 ESC-(-I in JIS, EUC, MS Kanji
6266 SI/SO in JIS, EUC, MS Kanji
6267 SS2 in EUC, JIS, not in MS Kanji
6268 MS Kanji (0xa0-0xdf)
6270 ESC-(-I in JIS (0x20-0x5f)
6271 SS2 in EUC (0xa0-0xdf)
6272 0xa0-0xd in MS Kanji (0xa0-0xdf)
6275 case 'X': /* Convert X0201 kana to X0208 */
6278 case 'F': /* preserve new lines */
6279 fold_preserve_f = TRUE;
/* no break here: 'F' continues into the 'f' handling below */
6280 case 'f': /* folding -f60 or -f */
6283 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6285 fold_len += *cp++ - '0';
6287 if (!(0<fold_len && fold_len<BUFSIZ))
6288 fold_len = DEFAULT_FOLD;
6292 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6294 fold_margin += *cp++ - '0';
6298 case 'm': /* MIME support */
6299 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6300 if (*cp=='B'||*cp=='Q') {
6301 mime_decode_mode = *cp++;
6302 mimebuf_f = FIXED_MIME;
6303 } else if (*cp=='N') {
6304 mime_f = TRUE; cp++;
6305 } else if (*cp=='S') {
6306 mime_f = STRICT_MIME; cp++;
6307 } else if (*cp=='0') {
6308 mime_decode_f = FALSE;
6309 mime_f = FALSE; cp++;
6311 mime_f = STRICT_MIME;
6314 case 'M': /* MIME output */
6317 mimeout_f = FIXED_MIME; cp++;
6318 } else if (*cp=='Q') {
6320 mimeout_f = FIXED_MIME; cp++;
6325 case 'B': /* Broken JIS support */
6327 bit:1 allow any x on ESC-(-x or ESC-$-x
6328 bit:2 reset to ascii on NL
6330 if ('9'>= *cp && *cp>='0')
6331 broken_f |= 1<<(*cp++ -'0');
6336 case 'O':/* for Output file */
6340 case 'c':/* add cr code */
6343 case 'd':/* delete cr code */
6346 case 'I': /* ISO-2022-JP output */
6349 case 'L': /* line mode */
6350 if (*cp=='u') { /* unix */
6351 eolmode_f = LF; cp++;
6352 } else if (*cp=='m') { /* mac */
6353 eolmode_f = CR; cp++;
6354 } else if (*cp=='w') { /* windows */
6355 eolmode_f = CRLF; cp++;
6356 } else if (*cp=='0') { /* no conversion */
6357 eolmode_f = 0; cp++;
6362 if ('2' <= *cp && *cp <= '9') {
6365 } else if (*cp == '0' || *cp == '1') {
6374 /* multiple options in a string are allowed for the Perl module */
6375 while(*cp && *cp++!='-');
6378 #if !defined(PERL_XS) && !defined(WIN32DLL)
6379 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6381 /* bogus option but ignored */
6389 #include "nkf32dll.c"
6390 #elif defined(PERL_XS)
6391 #else /* WIN32DLL */
/*
 * main: command-line entry point.
 *
 *   argc, argv  the usual argument vector; leading arguments that start
 *               with '-' are consumed as options before any input is read.
 *
 * Input is either stdin (kanji_convert(stdin)) or files opened one at a
 * time with fopen().  When file_out_f is TRUE, stdout is redirected with
 * dup2() onto a temporary file named after the input file (".nkftmpXXXXXX"
 * via mkstemp(), or "ntXXXXXX" via open()).  Afterwards stdout is restored,
 * the input file's mode (and, if preserve_time_f is set, its modification
 * time) is copied onto the temporary file, and the temporary file is
 * renamed to the original name.  The visible code can also rename the
 * original to a backup name from get_backup_filename(), or unlink it,
 * before that final rename.
 *
 * With guess_f set, print_guessed_code() is called after each input; the
 * file name is passed only when nfiles > 1.
 */
6393 main(int argc, char **argv)
6398 char *outfname = NULL;
6401 #ifdef EASYWIN /*Easy Win */
6402 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6404 #ifdef DEFAULT_CODE_LOCALE
6405 setlocale(LC_CTYPE, "");
6409 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6410 cp = (unsigned char *)*argv;
6415 if (pipe(fds) < 0 || (pid = fork()) < 0){
6426 execvp(argv[1], &argv[1]);
/* These flags are copied to *_back variables here and restored from them below. */
6443 int debug_f_back = debug_f;
6446 int exec_f_back = exec_f;
6449 int x0212_f_back = x0212_f;
6451 int x0213_f_back = x0213_f;
6452 int guess_f_back = guess_f;
6454 guess_f = guess_f_back;
6457 debug_f = debug_f_back;
6460 exec_f = exec_f_back;
6462 x0212_f = x0212_f_back;
6463 x0213_f = x0213_f_back;
6466 if (binmode_f == TRUE)
6467 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6468 if (freopen("","wb",stdout) == NULL)
6475 setbuf(stdout, (char *) NULL);
6477 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6480 if (binmode_f == TRUE)
6481 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6482 if (freopen("","rb",stdin) == NULL) return (-1);
6486 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
6490 kanji_convert(stdin);
6491 if (guess_f) print_guessed_code(NULL);
6495 int is_argument_error = FALSE;
6497 input_codename = NULL;
6500 iconv_for_check = 0;
6502 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6504 is_argument_error = TRUE;
6512 /* reopen file for stdout */
6513 if (file_out_f == TRUE) {
6516 outfname = nkf_xmalloc(strlen(origfname)
6517 + strlen(".nkftmpXXXXXX")
6519 strcpy(outfname, origfname);
6523 for (i = strlen(outfname); i; --i){
6524 if (outfname[i - 1] == '/'
6525 || outfname[i - 1] == '\\'){
6531 strcat(outfname, "ntXXXXXX");
6533 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6534 S_IREAD | S_IWRITE);
6536 strcat(outfname, ".nkftmpXXXXXX");
6537 fd = mkstemp(outfname);
6540 || (fd_backup = dup(fileno(stdout))) < 0
6541 || dup2(fd, fileno(stdout)) < 0
6552 outfname = "nkf.out";
6555 if(freopen(outfname, "w", stdout) == NULL) {
6559 if (binmode_f == TRUE) {
6560 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6561 if (freopen("","wb",stdout) == NULL)
6568 if (binmode_f == TRUE)
6569 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6570 if (freopen("","rb",fin) == NULL)
6575 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6579 char *filename = NULL;
6581 if (nfiles > 1) filename = origfname;
6582 if (guess_f) print_guessed_code(filename);
6588 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6596 if (dup2(fd_backup, fileno(stdout)) < 0){
6599 if (stat(origfname, &sb)) {
6600 fprintf(stderr, "Can't stat %s\n", origfname);
6602 /* Restore the permissions. */
6603 if (chmod(outfname, sb.st_mode)) {
6604 fprintf(stderr, "Can't set permission %s\n", outfname);
6607 /* Restore the timestamp. */
6608 if(preserve_time_f){
6609 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6610 tb[0] = tb[1] = sb.st_mtime;
6611 if (utime(outfname, tb)) {
6612 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6615 tb.actime = sb.st_atime;
6616 tb.modtime = sb.st_mtime;
6617 if (utime(outfname, &tb)) {
6618 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6623 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6625 unlink(backup_filename);
6627 if (rename(origfname, backup_filename)) {
6628 perror(backup_filename);
6629 fprintf(stderr, "Can't rename %s to %s\n",
6630 origfname, backup_filename);
6632 nkf_xfree(backup_filename);
6635 if (unlink(origfname)){
6640 if (rename(outfname, origfname)) {
6642 fprintf(stderr, "Can't rename %s to %s\n",
6643 outfname, origfname);
6645 nkf_xfree(outfname);
6650 if (is_argument_error)
6653 #ifdef EASYWIN /*Easy Win */
6654 if (file_out_f == FALSE)
6655 scanf("%d",&end_check);
6658 #else /* for Other OS */
6659 if (file_out_f == TRUE)
6661 #endif /*Easy Win */
6664 #endif /* WIN32DLL */