2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2009, The nkf Project.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 #define NKF_VERSION "2.0.8"
24 #define NKF_RELEASE_DATE "2009-01-19"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2009, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
213 {"ISO-2022-JP", ISO_2022_JP},
214 {"ISO2022JP-CP932", CP50220},
215 {"CP50220", CP50220},
216 {"CP50221", CP50221},
217 {"CSISO2022JP", CP50221},
218 {"CP50222", CP50222},
219 {"ISO-2022-JP-1", ISO_2022_JP_1},
220 {"ISO-2022-JP-3", ISO_2022_JP_3},
221 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
222 {"SHIFT_JIS", SHIFT_JIS},
224 {"WINDOWS-31J", WINDOWS_31J},
225 {"CSWINDOWS31J", WINDOWS_31J},
226 {"CP932", WINDOWS_31J},
227 {"MS932", WINDOWS_31J},
228 {"CP10001", CP10001},
231 {"EUCJP-NKF", EUCJP_NKF},
232 {"CP51932", CP51932},
233 {"EUC-JP-MS", EUCJP_MS},
234 {"EUCJP-MS", EUCJP_MS},
235 {"EUCJPMS", EUCJP_MS},
236 {"EUC-JP-ASCII", EUCJP_ASCII},
237 {"EUCJP-ASCII", EUCJP_ASCII},
238 {"SHIFT_JISX0213", SHIFT_JISX0213},
239 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
240 {"EUC-JISX0213", EUC_JISX0213},
241 {"EUC-JIS-2004", EUC_JIS_2004},
244 {"UTF-8-BOM", UTF_8_BOM},
245 {"UTF8-MAC", UTF8_MAC},
246 {"UTF-8-MAC", UTF8_MAC},
248 {"UTF-16BE", UTF_16BE},
249 {"UTF-16BE-BOM", UTF_16BE_BOM},
250 {"UTF-16LE", UTF_16LE},
251 {"UTF-16LE-BOM", UTF_16LE_BOM},
253 {"UTF-32BE", UTF_32BE},
254 {"UTF-32BE-BOM", UTF_32BE_BOM},
255 {"UTF-32LE", UTF_32LE},
256 {"UTF-32LE-BOM", UTF_32LE_BOM},
261 #if defined(DEFAULT_CODE_JIS)
262 #define DEFAULT_ENCIDX ISO_2022_JP
263 #elif defined(DEFAULT_CODE_SJIS)
264 #define DEFAULT_ENCIDX SHIFT_JIS
265 #elif defined(DEFAULT_CODE_WINDOWS_31J)
266 #define DEFAULT_ENCIDX WINDOWS_31J
267 #elif defined(DEFAULT_CODE_EUC)
268 #define DEFAULT_ENCIDX EUC_JP
269 #elif defined(DEFAULT_CODE_UTF8)
270 #define DEFAULT_ENCIDX UTF_8
/*
 * ASCII-only character classification and conversion helpers.
 * Every use of a macro parameter is parenthesized so that compound arguments
 * (e.g. `x & 0xff`, `a ? b : c`) group the way the caller wrote them.
 * NOTE: these are macros and evaluate their argument more than once; do not
 * pass expressions with side effects such as `*p++`.
 */
274 #define is_alnum(c) \
275 (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))
277 /* I don't trust portability of toupper */
278 #define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
279 #define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
280 #define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
281 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c) <= 'F'))
282 #define nkf_isblank(c) ((c) == SP || (c) == TAB)
283 #define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
284 #define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
285 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
286 #define nkf_isprint(c) (SP<=(c) && (c)<='~')
287 #define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* hex2bin: value of one hexadecimal digit; yields 0 for a non-hex character */
288 #define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
289 ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
290 ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* bin2hex: upper-case hex digit for the low 4 bits of c */
291 #define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* is_eucg3: true when the second-lowest byte of c2 (taken as unsigned short) equals SS3 */
292 #define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* nkf_noescape_mime: CR, LF, or a byte strictly between SP and DEL other than ? = _ ( ) . and '"' (0x22) */
293 #define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
294 (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
295 && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
297 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
298 #define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) < (0xE0&0x7F))
300 #define HOLD_SIZE 1024
301 #if defined(INT_IS_SHORT)
302 #define IOBUF_SIZE 2048
304 #define IOBUF_SIZE 16384
307 #define DEFAULT_J 'B'
308 #define DEFAULT_R 'B'
315 /* MIME preprocessor */
317 #ifdef EASYWIN /*Easy Win */
318 extern POINT _BufferSize;
327 void (*status_func)(struct input_code *, nkf_char);
328 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
332 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
333 static nkf_encoding *input_encoding = NULL;
334 static nkf_encoding *output_encoding = NULL;
336 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
338 * 0: Shift_JIS, eucJP-ascii
343 #define UCS_MAP_ASCII 0
345 #define UCS_MAP_CP932 2
346 #define UCS_MAP_CP10001 3
347 static int ms_ucs_map_f = UCS_MAP_ASCII;
349 #ifdef UTF8_INPUT_ENABLE
350 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
351 static int no_cp932ext_f = FALSE;
352 /* ignore ZERO WIDTH NO-BREAK SPACE */
353 static int no_best_fit_chars_f = FALSE;
354 static int input_endian = ENDIAN_BIG;
355 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
356 static void (*encode_fallback)(nkf_char c) = NULL;
357 static void w_status(struct input_code *, nkf_char);
359 #ifdef UTF8_OUTPUT_ENABLE
360 static int output_bom_f = FALSE;
361 static int output_endian = ENDIAN_BIG;
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_getc(FILE *f);
373 static void mime_putc(nkf_char c);
377 #if !defined(PERL_XS) && !defined(WIN32DLL)
378 static unsigned char stdibuf[IOBUF_SIZE];
379 static unsigned char stdobuf[IOBUF_SIZE];
/* Conversion mode flags (file-scope state; initial values shown are the defaults) */
383 static int unbuf_f = FALSE;
384 static int estab_f = FALSE;
385 static int nop_f = FALSE;
386 static int binmode_f = TRUE; /* binary mode */
387 static int rot_f = FALSE; /* ROT13/47 mode */
388 static int hira_f = FALSE; /* hiragana/katakana conversion */
389 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
390 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
391 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
392 static int mimebuf_f = FALSE; /* MIME buffered input */
393 static int broken_f = FALSE; /* convert ESC-less broken JIS */
394 static int iso8859_f = FALSE; /* ISO8859 through */
395 static int mimeout_f = FALSE; /* base64 mode */
396 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
397 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
399 #ifdef UNICODE_NORMALIZATION
400 static int nfc_f = FALSE;
/* get/unget hooks read by the NFC normalization stage; both default to the std_* functions */
401 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc */
402 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
406 static int cap_f = FALSE;
407 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
408 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
410 static int url_f = FALSE;
411 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
412 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * nkf_char tagging: the top 8 bits (CLASS_MASK) carry a class tag and the low
 * 24 bits (VALUE_MASK) carry the value. CLASS_UNICODE marks a value that is a
 * Unicode code point. Macro parameters are parenthesized so that compound
 * arguments are masked as a whole.
 */
415 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
416 #define CLASS_MASK NKF_INT32_C(0xFF000000)
417 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
418 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
419 #define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
420 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
421 #define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
422 #define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/* true when the class tag of c is CLASS_UNICODE */
423 #define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* the next two test only the 24-bit value; they do not check the class tag */
424 #define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
425 #define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
427 #ifdef NUMCHAR_OPTION
428 static int numchar_f = FALSE;
/* get/unget hooks for the numeric-character-reference input stage; both default to the std_* functions */
429 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ngetc */
430 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
434 static int noout_f = FALSE;
435 static void no_putc(nkf_char c);
436 static int debug_f = FALSE;
437 static void debug(const char *str);
438 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
441 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
442 static void set_input_codename(const char *codename);
445 static int exec_f = 0;
448 #ifdef SHIFTJIS_CP932
449 /* invert IBM extended characters to others */
450 static int cp51932_f = FALSE;
452 /* invert NEC-selected IBM extended characters to IBM extended characters */
453 static int cp932inv_f = TRUE;
455 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
456 #endif /* SHIFTJIS_CP932 */
458 static int x0212_f = FALSE;
459 static int x0213_f = FALSE;
461 static unsigned char prefix_table[256];
463 static void e_status(struct input_code *, nkf_char);
464 static void s_status(struct input_code *, nkf_char);
466 struct input_code input_code_list[] = {
467 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
468 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
469 #ifdef UTF8_INPUT_ENABLE
470 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
475 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
476 static int base64_count = 0;
478 /* X0208 -> ASCII converter */
481 static int f_line = 0; /* chars in line */
482 static int f_prev = 0;
483 static int fold_preserve_f = FALSE; /* preserve new lines */
484 static int fold_f = FALSE;
485 static int fold_len = 0;
488 static unsigned char kanji_intro = DEFAULT_J;
489 static unsigned char ascii_intro = DEFAULT_R;
493 #define FOLD_MARGIN 10
494 #define DEFAULT_FOLD 60
496 static int fold_margin = FOLD_MARGIN;
498 /* process default */
501 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
503 fprintf(stderr,"nkf internal module connection failure.\n");
509 no_connection(nkf_char c2, nkf_char c1)
511 no_connection2(c2,c1,0);
514 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
515 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
517 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
518 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
519 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
520 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
521 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
522 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
523 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
525 /* static redirections */
527 static void (*o_putc)(nkf_char c) = std_putc;
529 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
530 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
/* NOTE(review): presumably the upstream reader used by broken_getc/broken_ungetc — confirm against the code that installs these hooks */
532 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of bgetc */
533 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
535 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
537 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
538 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
540 /* for strict mime */
541 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
542 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
545 static int output_mode = ASCII; /* output kanji mode */
546 static int input_mode = ASCII; /* input kanji mode */
547 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
549 /* X0201 / X0208 conversion tables */
551 /* X0201 kana conversion table */
553 static const unsigned char cv[]= {
554 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
555 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
556 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
557 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
558 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
559 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
560 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
561 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
562 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
563 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
564 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
565 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
566 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
567 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
568 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
569 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
573 /* X0201 kana conversion table for dakuten (voiced sound mark) */
575 static const unsigned char dv[]= {
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
580 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
581 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
582 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
583 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
584 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
585 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
586 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
587 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
594 /* X0201 kana conversion table for han-dakuten (semi-voiced sound mark) */
596 static const unsigned char ev[]= {
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
608 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 /* X0208 kigou conversion table */
617 /* 0x8140 - 0x819e */
618 static const unsigned char fv[] = {
620 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
621 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
622 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
624 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
625 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
626 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
627 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
628 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
630 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
636 static int option_mode = 0;
637 static int file_out_f = FALSE;
639 static int overwrite_f = FALSE;
640 static int preserve_time_f = FALSE;
641 static int backup_f = FALSE;
642 static char *backup_suffix = "";
645 static int eolmode_f = 0; /* CR, LF, CRLF */
646 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
647 static nkf_char prev_cr = 0; /* CR or 0 */
648 #ifdef EASYWIN /*Easy Win */
649 static int end_check;
653 nkf_xmalloc(size_t size)
657 if (size == 0) size = 1;
661 perror("can't malloc");
669 nkf_xrealloc(void *ptr, size_t size)
671 if (size == 0) size = 1;
673 ptr = realloc(ptr, size);
675 perror("can't realloc");
682 #define nkf_xfree(ptr) free(ptr)
685 nkf_str_caseeql(const char *src, const char *target)
688 for (i = 0; src[i] && target[i]; i++) {
689 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
691 if (src[i] || target[i]) return FALSE;
696 nkf_enc_from_index(int idx)
698 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
701 return &nkf_encoding_table[idx];
705 nkf_enc_find_index(const char *name)
708 if (name[0] == 'X' && *(name+1) == '-') name += 2;
709 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
710 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
711 return encoding_name_to_id_table[i].id;
718 nkf_enc_find(const char *name)
721 idx = nkf_enc_find_index(name);
722 if (idx < 0) return 0;
723 return nkf_enc_from_index(idx);
726 #define nkf_enc_name(enc) (enc)->name
727 #define nkf_enc_to_index(enc) (enc)->id
728 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
729 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
730 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
731 #define nkf_enc_asciicompat(enc) (\
732 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
733 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
734 #define nkf_enc_unicode_p(enc) (\
735 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
736 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
737 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
738 #define nkf_enc_cp5022x_p(enc) (\
739 nkf_enc_to_index(enc) == CP50220 ||\
740 nkf_enc_to_index(enc) == CP50221 ||\
741 nkf_enc_to_index(enc) == CP50222)
743 #ifdef DEFAULT_CODE_LOCALE
747 #ifdef HAVE_LANGINFO_H
748 return nl_langinfo(CODESET);
749 #elif defined(__WIN32__)
751 sprintf(buf, "CP%d", GetACP());
753 #elif defined(__OS2__)
754 # if defined(INT_IS_SHORT)
760 ULONG ulCP[1], ulncp;
761 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
762 if (ulCP[0] == 932 || ulCP[0] == 943)
763 strcpy(buf, "Shift_JIS");
765 sprintf(buf, "CP%lu", ulCP[0]);
773 nkf_locale_encoding()
775 nkf_encoding *enc = 0;
776 const char *encname = nkf_locale_charmap();
778 enc = nkf_enc_find(encname);
781 #endif /* DEFAULT_CODE_LOCALE */
786 return &nkf_encoding_table[UTF_8];
790 nkf_default_encoding()
792 nkf_encoding *enc = 0;
793 #ifdef DEFAULT_CODE_LOCALE
794 enc = nkf_locale_encoding();
795 #elif defined(DEFAULT_ENCIDX)
796 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
798 if (!enc) enc = nkf_utf8_encoding();
809 nkf_buf_new(int length)
811 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
812 buf->ptr = nkf_xmalloc(length);
820 nkf_buf_dispose(nkf_buf_t *buf)
827 #define nkf_buf_length(buf) ((buf)->len)
828 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
/*
 * Return the byte stored at position `index` of `buf` (reads buf->ptr[index]).
 * NOTE(review): the assertion admits index == buf->len, but nkf_buf_push
 * stores new bytes at ptr[len] before incrementing len, so the last pushed
 * byte lives at len - 1. Confirm whether `index < buf->len` was intended.
 * The assertion also does not reject a negative index.
 */
831 nkf_buf_at(nkf_buf_t *buf, int index)
833 assert(index <= buf->len);
834 return buf->ptr[index];
838 nkf_buf_clear(nkf_buf_t *buf)
844 nkf_buf_push(nkf_buf_t *buf, unsigned char c)
846 if (buf->capa <= buf->len) {
849 buf->ptr[buf->len++] = c;
853 nkf_buf_pop(nkf_buf_t *buf)
855 assert(!nkf_buf_empty_p(buf));
856 return buf->ptr[--buf->len];
859 /* Normalization Form C */
862 #define fprintf dllprintf
868 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
875 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
876 " j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
877 #ifdef UTF8_OUTPUT_ENABLE
878 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
880 " J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
881 #ifdef UTF8_INPUT_ENABLE
882 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
886 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
887 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
888 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
891 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
892 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
893 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
894 " X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
897 " O Output to File (DEFAULT 'nkf.out')\n"
898 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
901 "Long name options\n"
902 " --ic=<input codeset> --oc=<output codeset>\n"
903 " Specify the input or output codeset\n"
904 " --hiragana --katakana --katakana-hiragana\n"
905 " To Hiragana/Katakana Conversion\n"
909 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
911 #ifdef NUMCHAR_OPTION
912 " --numchar-input Convert Unicode Character Reference\n"
914 #ifdef UTF8_INPUT_ENABLE
915 " --fb-{skip, html, xml, perl, java, subchar}\n"
916 " Specify how nkf handles unassigned characters\n"
921 " --in-place[=SUF] Overwrite original listed files by filtered result\n"
922 " --overwrite[=SUF] in-place and preserve timestamp of original files\n"
924 " -g --guess Guess the input code\n"
925 " -v --version print the version\n"
926 " --help/-V print this help / configuration\n"
932 show_configuration(void)
935 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
936 " Compile-time options:\n"
937 " Compiled at: " __DATE__ " " __TIME__ "\n"
940 " Default output encoding: "
941 #ifdef DEFAULT_CODE_LOCALE
942 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
943 #elif defined(DEFAULT_ENCIDX)
944 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
950 " Default output end of line: "
951 #if DEFAULT_NEWLINE == CR
953 #elif DEFAULT_NEWLINE == CRLF
959 " Decode MIME encoded string: "
960 #if MIME_DECODE_DEFAULT
966 " Convert JIS X 0201 Katakana: "
973 " --help, --version output: "
974 #if HELP_OUTPUT_HELP_OUTPUT
/*
 * Build a backup file name from `suffix` and `filename`.
 * Two strategies are visible below:
 *   - template form: every '*' in `suffix` is replaced by `filename`;
 *   - plain form: `suffix` is appended to `filename`.
 * NOTE(review): the condition selecting between them is not visible in this
 * excerpt; presumably the template form is used when `suffix` contains at
 * least one '*' — confirm.
 * Returns a buffer obtained from nkf_xmalloc.
 */
985 get_backup_filename(const char *suffix, const char *filename)
987 char *backup_filename;
988 int asterisk_count = 0;
990 int filename_length = strlen(filename);
/* count the '*' placeholders in the suffix */
992 for(i = 0; suffix[i]; i++){
993 if(suffix[i] == '*') asterisk_count++;
/* each '*' (1 char) grows to filename_length chars, hence (filename_length - 1) extra per placeholder, plus the NUL */
997 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
998 for(i = 0, j = 0; suffix[i];){
999 if(suffix[i] == '*'){
/* terminate what has been written so far so strncat appends at position j */
1000 backup_filename[j] = '\0';
1001 strncat(backup_filename, filename, filename_length);
1003 j += filename_length;
/* any other suffix character is copied through unchanged */
1005 backup_filename[j++] = suffix[i++];
1008 backup_filename[j] = '\0';
/* plain form: filename immediately followed by suffix */
1010 j = filename_length + strlen(suffix);
1011 backup_filename = nkf_xmalloc(j + 1);
1012 strcpy(backup_filename, filename);
1013 strcat(backup_filename, suffix);
1014 backup_filename[j] = '\0';
1016 return backup_filename;
1020 #ifdef UTF8_INPUT_ENABLE
1022 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1029 (*f)(0, bin2hex(c>>shift));
1040 encode_fallback_html(nkf_char c)
1045 if(c >= NKF_INT32_C(1000000))
1046 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1047 if(c >= NKF_INT32_C(100000))
1048 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1050 (*oconv)(0, 0x30+(c/10000 )%10);
1052 (*oconv)(0, 0x30+(c/1000 )%10);
1054 (*oconv)(0, 0x30+(c/100 )%10);
1056 (*oconv)(0, 0x30+(c/10 )%10);
1058 (*oconv)(0, 0x30+ c %10);
1064 encode_fallback_xml(nkf_char c)
1069 nkf_each_char_to_hex(oconv, c);
1075 encode_fallback_java(nkf_char c)
1079 if(!nkf_char_unicode_bmp_p(c)){
1083 (*oconv)(0, bin2hex(c>>20));
1084 (*oconv)(0, bin2hex(c>>16));
1088 (*oconv)(0, bin2hex(c>>12));
1089 (*oconv)(0, bin2hex(c>> 8));
1090 (*oconv)(0, bin2hex(c>> 4));
1091 (*oconv)(0, bin2hex(c ));
1096 encode_fallback_perl(nkf_char c)
1101 nkf_each_char_to_hex(oconv, c);
1107 encode_fallback_subchar(nkf_char c)
1109 c = unicode_subchar;
1110 (*oconv)((c>>8)&0xFF, c&0xFF);
1115 static const struct {
1139 {"katakana-hiragana","h3"},
1147 #ifdef UTF8_OUTPUT_ENABLE
1157 {"fb-subchar=", ""},
1159 #ifdef UTF8_INPUT_ENABLE
1160 {"utf8-input", "W"},
1161 {"utf16-input", "W16"},
1162 {"no-cp932ext", ""},
1163 {"no-best-fit-chars",""},
1165 #ifdef UNICODE_NORMALIZATION
1166 {"utf8mac-input", ""},
1178 #ifdef NUMCHAR_OPTION
1179 {"numchar-input", ""},
1185 #ifdef SHIFTJIS_CP932
1196 set_input_encoding(nkf_encoding *enc)
1198 switch (nkf_enc_to_index(enc)) {
1205 #ifdef SHIFTJIS_CP932
1208 #ifdef UTF8_OUTPUT_ENABLE
1209 ms_ucs_map_f = UCS_MAP_CP932;
1219 case ISO_2022_JP_2004:
1226 #ifdef SHIFTJIS_CP932
1229 #ifdef UTF8_OUTPUT_ENABLE
1230 ms_ucs_map_f = UCS_MAP_CP932;
1235 #ifdef SHIFTJIS_CP932
1238 #ifdef UTF8_OUTPUT_ENABLE
1239 ms_ucs_map_f = UCS_MAP_CP10001;
1247 #ifdef SHIFTJIS_CP932
1250 #ifdef UTF8_OUTPUT_ENABLE
1251 ms_ucs_map_f = UCS_MAP_CP932;
1255 #ifdef SHIFTJIS_CP932
1258 #ifdef UTF8_OUTPUT_ENABLE
1259 ms_ucs_map_f = UCS_MAP_MS;
1263 #ifdef SHIFTJIS_CP932
1266 #ifdef UTF8_OUTPUT_ENABLE
1267 ms_ucs_map_f = UCS_MAP_ASCII;
1270 case SHIFT_JISX0213:
1271 case SHIFT_JIS_2004:
1273 #ifdef SHIFTJIS_CP932
1280 #ifdef SHIFTJIS_CP932
1284 #ifdef UTF8_INPUT_ENABLE
1285 #ifdef UNICODE_NORMALIZATION
1293 input_endian = ENDIAN_BIG;
1297 input_endian = ENDIAN_LITTLE;
1302 input_endian = ENDIAN_BIG;
1306 input_endian = ENDIAN_LITTLE;
1313 set_output_encoding(nkf_encoding *enc)
1315 switch (nkf_enc_to_index(enc)) {
1318 #ifdef SHIFTJIS_CP932
1319 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1321 #ifdef UTF8_OUTPUT_ENABLE
1322 ms_ucs_map_f = UCS_MAP_CP932;
1326 #ifdef SHIFTJIS_CP932
1327 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1329 #ifdef UTF8_OUTPUT_ENABLE
1330 ms_ucs_map_f = UCS_MAP_CP932;
1335 #ifdef SHIFTJIS_CP932
1336 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1342 #ifdef SHIFTJIS_CP932
1343 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1349 #ifdef UTF8_OUTPUT_ENABLE
1350 ms_ucs_map_f = UCS_MAP_CP932;
1354 #ifdef UTF8_OUTPUT_ENABLE
1355 ms_ucs_map_f = UCS_MAP_CP10001;
1360 #ifdef SHIFTJIS_CP932
1361 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1363 #ifdef UTF8_OUTPUT_ENABLE
1364 ms_ucs_map_f = UCS_MAP_ASCII;
1369 #ifdef SHIFTJIS_CP932
1370 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1372 #ifdef UTF8_OUTPUT_ENABLE
1373 ms_ucs_map_f = UCS_MAP_ASCII;
1377 #ifdef SHIFTJIS_CP932
1378 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1380 #ifdef UTF8_OUTPUT_ENABLE
1381 ms_ucs_map_f = UCS_MAP_CP932;
1386 #ifdef UTF8_OUTPUT_ENABLE
1387 ms_ucs_map_f = UCS_MAP_MS;
1392 #ifdef UTF8_OUTPUT_ENABLE
1393 ms_ucs_map_f = UCS_MAP_ASCII;
1396 case SHIFT_JISX0213:
1397 case SHIFT_JIS_2004:
1399 #ifdef SHIFTJIS_CP932
1400 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1407 #ifdef SHIFTJIS_CP932
1408 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1411 #ifdef UTF8_OUTPUT_ENABLE
1413 output_bom_f = TRUE;
1417 output_bom_f = TRUE;
1420 output_endian = ENDIAN_LITTLE;
1421 output_bom_f = FALSE;
1424 output_endian = ENDIAN_LITTLE;
1425 output_bom_f = TRUE;
1428 output_bom_f = TRUE;
1431 output_endian = ENDIAN_LITTLE;
1432 output_bom_f = FALSE;
1435 output_endian = ENDIAN_LITTLE;
1436 output_bom_f = TRUE;
1442 static struct input_code*
1443 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1446 struct input_code *p = input_code_list;
1448 if (iconv_func == p->iconv_func){
1458 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1460 #ifdef INPUT_CODE_FIX
1461 if (f || !input_encoding)
1468 #ifdef INPUT_CODE_FIX
1469 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1475 if (estab_f && iconv_for_check != iconv){
1476 struct input_code *p = find_inputcode_byfunc(iconv);
1478 set_input_codename(p->name);
1481 iconv_for_check = iconv;
1488 x0212_shift(nkf_char c)
1493 if (0x75 <= c && c <= 0x7f){
1494 ret = c + (0x109 - 0x75);
1497 if (0x75 <= c && c <= 0x7f){
1498 ret = c + (0x113 - 0x75);
1506 x0212_unshift(nkf_char c)
1509 if (0x7f <= c && c <= 0x88){
1510 ret = c + (0x75 - 0x7f);
1511 }else if (0x89 <= c && c <= 0x92){
1512 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1516 #endif /* X0212_ENABLE */
1519 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1525 if((0x21 <= ndx && ndx <= 0x2F)){
1526 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1527 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1529 }else if(0x6E <= ndx && ndx <= 0x7E){
1530 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1531 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1537 else if(nkf_isgraph(ndx)){
1539 const unsigned short *ptr;
1540 ptr = x0212_shiftjis[ndx - 0x21];
1542 val = ptr[(c1 & 0x7f) - 0x21];
1551 c2 = x0212_shift(c2);
1553 #endif /* X0212_ENABLE */
1555 if(0x7F < c2) return 1;
1556 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1557 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s2e_conv: convert a two-byte code (c2 = lead, c1 = trail) and hand the
 * result back through p2 / p1.  By name this is the inverse of e2s_conv,
 * i.e. Shift_JIS -> EUC-JP style codes (TODO confirm).
 * Returns 1 when c1 is above 0xFC.  Callers in this file treat a nonzero
 * result as failure.
 * Visible stages:
 *   - CP932: IBM-extension lead bytes are remapped through shiftjis_cp932,
 *     and a second range through cp932inv.
 *   - without x0213_f, IBM-extension lead bytes are looked up in
 *     shiftjis_x0212 and tagged with PREFIX_EUCG3.
 *   - with x0213_f and c2 >= 0xF0 the row is taken from
 *     shift_jisx0213_s1a3_table or computed as c2 * 2 - 0x17B.
 *   - otherwise the row is 2*c2 minus SJ0162 / SJ6394, plus one when the
 *     trail byte is above 0x9E.
 * NOTE(review): the tables are indexed with `c1 - 0x40`; only the upper
 * bound of c1 is checked in the visible lines, so callers presumably
 * guarantee c1 >= 0x40 - confirm.
 */
1562 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1564 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1567 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1568 if (0xFC < c1) return 1;
1569 #ifdef SHIFTJIS_CP932
1570 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1571 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1578 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1579 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1585 #endif /* SHIFTJIS_CP932 */
1587 if (!x0213_f && is_ibmext_in_sjis(c2)){
1588 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1591 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1604 if(x0213_f && c2 >= 0xF0){
1605 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1606 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1607 }else{ /* 78<=k<=94 */
1608 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1609 if (0x9E < c1) c2++;
1612 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1613 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1614 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1615 if (0x9E < c1) c2++;
1618 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1625 c2 = x0212_unshift(c2);
1632 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
1634 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1642 }else if (val < 0x800){
1643 *p1 = 0xc0 | (val >> 6);
1644 *p2 = 0x80 | (val & 0x3f);
1647 } else if (nkf_char_unicode_bmp_p(val)) {
1648 *p1 = 0xe0 | (val >> 12);
1649 *p2 = 0x80 | ((val >> 6) & 0x3f);
1650 *p3 = 0x80 | ( val & 0x3f);
1652 } else if (nkf_char_unicode_value_p(val)) {
1653 *p1 = 0xe0 | (val >> 16);
1654 *p2 = 0x80 | ((val >> 12) & 0x3f);
1655 *p3 = 0x80 | ((val >> 6) & 0x3f);
1656 *p4 = 0x80 | ( val & 0x3f);
1666 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
1673 else if (c1 <= 0xC3) {
1674 /* trail byte or invalid */
1677 else if (c1 <= 0xDF) {
1679 wc = (c1 & 0x1F) << 6;
1682 else if (c1 <= 0xEF) {
1684 wc = (c1 & 0x0F) << 12;
1685 wc |= (c2 & 0x3F) << 6;
1688 else if (c2 <= 0xF4) {
1690 wc = (c1 & 0x0F) << 18;
1691 wc |= (c2 & 0x3F) << 12;
1692 wc |= (c3 & 0x3F) << 6;
1702 #ifdef UTF8_INPUT_ENABLE
/*
 * unicode_to_jis_common2: two-level table lookup used by
 * unicode_to_jis_common.
 *   c1, c0 - indexes; c1 is checked against psize, c0 against
 *            sizeof_utf8_to_euc_C2.
 *   pp     - first-level table of psize row pointers (may be null).
 *   p2, p1 - outputs.
 * Returns 1 when the table or row is missing, an index is out of range,
 * or the looked-up value is 0.  With no_cp932ext_f set, values whose high
 * byte is 0x2D (NEC special characters) or that exceed 0xF300 (IBM
 * extended characters) take a separate path whose body is not visible.
 * A resulting c2 equal to SO is rewritten to JIS_X_0201_1976_K.
 * NOTE(review): how c0 is turned into the row index, and the success
 * return value, are not visible in this listing.
 */
1704 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1705 const unsigned short *const *pp, nkf_char psize,
1706 nkf_char *p2, nkf_char *p1)
1709 const unsigned short *p;
1712 if (pp == 0) return 1;
1715 if (c1 < 0 || psize <= c1) return 1;
1717 if (p == 0) return 1;
1720 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1722 if (val == 0) return 1;
1723 if (no_cp932ext_f && (
1724 (val>>8) == 0x2D || /* NEC special characters */
1725 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1733 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * unicode_to_jis_common: map a UTF-8 byte sequence (c2 = lead byte,
 * c1, c0 = following bytes) to a JIS-style pair stored through p2 / p1.
 * Returns 1 for sequences that are deliberately left unmapped; otherwise
 * the result of unicode_to_jis_common2() is kept in `ret`.
 *
 * Two-byte sequences (c2 < 0xe0) and three-byte sequences are handled
 * separately.  When no_best_fit_chars_f is set, per-mapping exclusion
 * tables (indexed by c1 & 0x3F) or explicit byte comparisons reject
 * characters that only have a best-fit counterpart.  The mapping table
 * is chosen by ms_ucs_map_f (CP932 / MS / CP10001 / default).
 *
 * Under SHIFTJIS_CP932, a successful result in the G3 range is passed
 * through e2s_conv() and back through s2e_conv() unless cp932inv_f is set.
 *
 * NOTE(review): the three-byte branch is opened with `c0 < 0xF0`; whether
 * that was meant to test the lead byte c2 cannot be decided from this
 * listing - confirm.
 */
1741 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1743 const unsigned short *const *pp;
1744 const unsigned short *const *const *ppp;
1745 static const char no_best_fit_chars_table_C2[] =
1746 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1747 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1748 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1749 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1750 static const char no_best_fit_chars_table_C2_ms[] =
1751 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1752 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1753 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1754 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1755 static const char no_best_fit_chars_table_932_C2[] =
1756 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1757 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1758 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1759 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1760 static const char no_best_fit_chars_table_932_C3[] =
1761 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1762 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1763 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1764 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1770 }else if(c2 < 0xe0){
1771 if(no_best_fit_chars_f){
1772 if(ms_ucs_map_f == UCS_MAP_CP932){
1775 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1778 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1781 }else if(!cp932inv_f){
1784 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1787 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1790 }else if(ms_ucs_map_f == UCS_MAP_MS){
1791 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1792 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1810 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1811 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1812 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1814 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1815 }else if(c0 < 0xF0){
1816 if(no_best_fit_chars_f){
1817 if(ms_ucs_map_f == UCS_MAP_CP932){
1818 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1819 }else if(ms_ucs_map_f == UCS_MAP_MS){
1824 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1827 if(c0 == 0x92) return 1;
1832 if(c1 == 0x80 || c0 == 0x9C) return 1;
1835 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1840 if(c0 == 0x94) return 1;
1843 if(c0 == 0xBB) return 1;
1853 if(c0 == 0x95) return 1;
1856 if(c0 == 0xA5) return 1;
1863 if(c0 == 0x8D) return 1;
1866 if(c0 == 0x9E && !cp932inv_f) return 1;
1869 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1877 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1878 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1879 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1881 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1883 #ifdef SHIFTJIS_CP932
1884 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1886 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1887 s2e_conv(s2, s1, p2, p1);
1896 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: look up the value for an internal (c2, c1) pair in the
 * euc_to_utf8 tables.  code_score() treats a zero result as
 * "character does not exist".
 * Table selection, as far as visible:
 *   - c2 == JIS_X_0201_1976_K : euc_to_utf8_1byte (with a CP10001 special
 *                               case whose body is not visible)
 *   - G3 codes (is_eucg3)     : x0212_to_utf8_2bytes, with a special case
 *                               for 0x8F22/0x43 under UCS_MAP_ASCII
 *   - everything else         : euc_to_utf8_2bytes, _mac or _ms depending
 *                               on ms_ucs_map_f
 * Row index is (c2 & 0x7f) - 0x21 and column index (c1 & 0x7f) - 0x21;
 * both are range-checked before use.
 */
1898 e2w_conv(nkf_char c2, nkf_char c1)
1900 const unsigned short *p;
1902 if (c2 == JIS_X_0201_1976_K) {
1903 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1911 p = euc_to_utf8_1byte;
1913 } else if (is_eucg3(c2)){
1914 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1917 c2 = (c2&0x7f) - 0x21;
1918 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1919 p = x0212_to_utf8_2bytes[c2];
1925 c2 = (c2&0x7f) - 0x21;
1926 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1928 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1929 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1930 euc_to_utf8_2bytes_ms[c2];
1935 c1 = (c1 & 0x7f) - 0x21;
1936 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w2e_conv: convert a UTF-8 sequence (c2 = lead byte, c1, c0 = following
 * bytes) to an internal pair stored through p2 / p1.
 * Lead bytes 0xc0..0xef are delegated to unicode_to_jis_common().  Under
 * NUMCHAR_OPTION the visible fallback stores the decoded code point,
 * wrapped by nkf_char_unicode_new(), into *p1.
 * NOTE(review): the condition guarding that fallback and the handling of
 * other lead bytes are not visible in this listing.
 */
1943 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1950 }else if (0xc0 <= c2 && c2 <= 0xef) {
1951 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1952 #ifdef NUMCHAR_OPTION
1955 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1963 #ifdef UTF8_INPUT_ENABLE
/*
 * w16e_conv: convert a Unicode code point `val` to an internal pair
 * stored through p2 / p1.
 * BMP values are first expanded to UTF-8 bytes and then mapped with
 * unicode_to_jis_common().  Two visible paths store
 * nkf_char_unicode_new(val) in *p1 instead, i.e. keep the character as a
 * raw Unicode value.
 * NOTE(review): the conditions selecting those paths are not visible.
 */
1965 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1967 nkf_char c1, c2, c3, c4;
1974 else if (nkf_char_unicode_bmp_p(val)){
1975 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1976 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1979 *p1 = nkf_char_unicode_new(val);
1985 *p1 = nkf_char_unicode_new(val);
/*
 * e_iconv: input converter with the (c2, c1, c0) signature; the constants
 * tested (SS2, 0x8f, 0xA1..0xFE) are those of EUC-JP input.
 * Visible cases:
 *   - c2 is JIS_X_0201_1976_K or SS2: replaced by GETA1/GETA2 when
 *     iso2022jp_f is set without x0201_f, otherwise tagged as
 *     JIS_X_0201_1976_K.
 *   - c2 == 0x8f: rows 0xF5..0xFE are mapped into the Unicode private use
 *     area starting at 0xE3AC (eucJP-ms); otherwise c2 becomes
 *     (c2 << 8) | (c1 & 0x7f), optionally round-tripped through
 *     e2s_conv() / s2e_conv() under SHIFTJIS_CP932.
 *   - EOF, 0, control codes and ISO_8859_1 have their own branch (body
 *     not visible).
 *   - otherwise rows 0xF5..0xFE map to the private use area starting at
 *     0xE000 when ms_ucs_map_f is set, and under cp51932_f rows
 *     0x79..0x7c are round-tripped through e2s_conv() / s2e_conv().
 */
1992 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
1994 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
1995 if (iso2022jp_f && !x0201_f) {
1996 c2 = GETA1; c1 = GETA2;
1998 c2 = JIS_X_0201_1976_K;
2002 }else if (c2 == 0x8f){
2006 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2007 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2008 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2011 c2 = (c2 << 8) | (c1 & 0x7f);
2013 #ifdef SHIFTJIS_CP932
2016 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2017 s2e_conv(s2, s1, &c2, &c1);
2024 #endif /* SHIFTJIS_CP932 */
2026 #endif /* X0212_ENABLE */
2027 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2030 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2031 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2032 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2037 #ifdef SHIFTJIS_CP932
2038 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2040 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2041 s2e_conv(s2, s1, &c2, &c1);
2048 #endif /* SHIFTJIS_CP932 */
/*
 * s_iconv: input converter with the (c2, c1, c0) signature; it hands
 * ordinary two-byte codes to s2e_conv(), i.e. Shift_JIS input.
 * Visible cases:
 *   - c2 is JIS_X_0201_1976_K or in 0xA1..0xDF: replaced by GETA1/GETA2
 *     when iso2022jp_f is set without x0201_f.
 *   - EOF, 0 and control codes: separate branch (body not visible).
 *   - without x0213_f, lead bytes 0xF0..0xF9 with trail 0x40..0xFC are
 *     mapped to the Unicode private use area from 0xE000, 188 cells per
 *     lead byte, skipping trail byte 0x7F (which returns 0).
 *   - otherwise s2e_conv() converts the pair; a nonzero result is
 *     returned to the caller.
 */
2056 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2058 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2059 if (iso2022jp_f && !x0201_f) {
2060 c2 = GETA1; c1 = GETA2;
2064 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2066 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2068 if(c1 == 0x7F) return 0;
2069 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2072 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2073 if (ret) return ret;
/*
 * w_iconv: input converter for UTF-8.  Note the parameter order: c1 is
 * the lead byte here, followed by c2 and c3.
 * The lead byte (0xC0 and above) is classified through
 * w_iconv_utf8_1st_byte and the following bytes are validated per class.
 * Visible return values:
 *    0  when the second byte of a two-byte form is not 0x80..0xBF
 *   -1  when a third byte is still needed (c3 == 0)
 *   -2  in the four-byte classes when c3 == 0
 * After validation, four-byte sequences are decoded with
 * nkf_utf8_to_unicode() and kept as a raw Unicode value; everything else
 * goes through w2e_conv().
 * NOTE(review): c4 is initialised to 0 and no later assignment is visible
 * in this listing; the four-byte checks rely on it holding the fourth
 * byte - confirm where it is filled in.
 */
2080 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2082 nkf_char ret = 0, c4 = 0;
2083 static const char w_iconv_utf8_1st_byte[] =
2085 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2086 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2087 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2088 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2095 if (c1 < 0 || 0xff < c1) {
2096 }else if (c1 == 0) { /* 0 : 1 byte*/
2098 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2101 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2103 if (c2 < 0x80 || 0xBF < c2) return 0;
2106 if (c3 == 0) return -1;
2107 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2112 if (c3 == 0) return -1;
2113 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2117 if (c3 == 0) return -1;
2118 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2122 if (c3 == 0) return -2;
2123 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2127 if (c3 == 0) return -2;
2128 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2132 if (c3 == 0) return -2;
2133 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2141 if (c1 == 0 || c1 == EOF){
2142 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2143 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2146 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2154 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * unicode_iconv: feed one decoded code point `wc` into the conversion
 * pipeline.
 *   - (wc >> 11) == 27 selects 0xD800..0xDFFF (unpaired surrogates) and
 *     returns NKF_ICONV_INVALID_CODE_RANGE.
 *   - wc < 0xFFFF is converted with w16e_conv(); its nonzero result is
 *     returned.
 *   - wc < 0x10FFFF is kept as a raw Unicode value.
 *   - anything else returns NKF_ICONV_INVALID_CODE_RANGE.
 * NOTE(review): both upper bounds are strict, so 0xFFFF takes the
 * "raw value" path and 0x10FFFF is rejected - confirm this is intended.
 */
2156 unicode_iconv(nkf_char wc)
2164 }else if ((wc>>11) == 27) {
2165 /* unpaired surrogate */
2166 return NKF_ICONV_INVALID_CODE_RANGE;
2167 }else if (wc < 0xFFFF) {
2168 ret = w16e_conv(wc, &c2, &c1);
2169 if (ret) return ret;
2170 }else if (wc < 0x10FFFF) {
2172 c1 = nkf_char_unicode_new(wc);
2174 return NKF_ICONV_INVALID_CODE_RANGE;
2180 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2181 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
2182 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * nkf_iconv_utf_16: assemble a code point from up to four UTF-16 bytes
 * (c1..c4 in stream order) and pass it to unicode_iconv().
 * The high byte of each 16-bit unit is c1/c3 for ENDIAN_BIG and c2/c4
 * otherwise.  When the first unit is a high surrogate (high byte
 * 0xD8..0xDB), the second unit must be a low surrogate (high byte
 * 0xDC..0xDF) and the two are combined with UTF16_TO_UTF32(); if it is
 * not, NKF_ICONV_NEED_TWO_MORE_BYTES is returned.
 */
2184 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2193 if (input_endian == ENDIAN_BIG) {
2194 if (0xD8 <= c1 && c1 <= 0xDB) {
2195 if (0xDC <= c3 && c3 <= 0xDF) {
2196 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2197 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2202 if (0xD8 <= c2 && c2 <= 0xDB) {
2203 if (0xDC <= c4 && c4 <= 0xDF) {
2204 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2205 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2211 return (*unicode_iconv)(wc);
2215 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2221 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * nkf_iconv_utf_32: assemble a code point from four UTF-32 bytes
 * (c1..c4 in stream order) according to input_endian and pass it to
 * unicode_iconv().  Each visible case combines the three low-order bytes
 * for one byte order; an unrecognised input_endian returns
 * NKF_ICONV_INVALID_CODE_RANGE.
 * NOTE(review): the case labels, and any check of the remaining
 * (most significant) byte, are not visible in this listing.
 */
2227 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2236 switch(input_endian){
2238 wc = c2 << 16 | c3 << 8 | c4;
2241 wc = c3 << 16 | c2 << 8 | c1;
2244 wc = c1 << 16 | c4 << 8 | c3;
2247 wc = c4 << 16 | c1 << 8 | c2;
2250 return NKF_ICONV_INVALID_CODE_RANGE;
2253 return (*unicode_iconv)(wc);
/*
 * output_ascii_escape_sequence(mode): when the current output_mode is
 * neither ASCII nor ISO_8859_1, emit an escape sequence ending in
 * `ascii_intro` through o_putc and record `mode` as the new output_mode.
 * NOTE(review): the leading bytes of the sequence are on lines not
 * visible in this listing.
 */
2257 #define output_ascii_escape_sequence(mode) do { \
2258 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2261 (*o_putc)(ascii_intro); \
2262 output_mode = mode; \
/*
 * output_escape_sequence: switch the output to character set `mode`.
 * Nothing is done when output_mode already equals `mode`; otherwise a
 * per-mode sequence is written with o_putc (the JIS_X_0201_1976_K case
 * and a case using `kanji_intro` are visible).
 * NOTE(review): most of the switch body is not visible in this listing.
 */
2267 output_escape_sequence(int mode)
2269 if (output_mode == mode)
2277 case JIS_X_0201_1976_K:
2285 (*o_putc)(kanji_intro);
/*
 * j_oconv: output converter that announces character sets with
 * output_escape_sequence() / output_ascii_escape_sequence(), i.e. an
 * ISO-2022 style (JIS) encoder.
 * Under NUMCHAR_OPTION a raw Unicode value (c2 == 0 with the Unicode tag
 * on c1) is first converted with w16e_conv(); if that fails, private use
 * values 0xE000..0xE757 are mapped to rows from 0x7F upward (94 cells per
 * row) when ms_ucs_map_f is set, and anything else is handed to
 * encode_fallback when one is installed.
 * Dispatch on c2: EOF and ISO_8859_1 switch back via the ASCII-style
 * macro, JIS_X_0201_1976_K and G3 codes (JIS X 0213 plane 2 or
 * JIS X 0212, depending on x0213_f) select their own set, and ordinary
 * two-byte codes are range-checked and sent as JIS X 0213 plane 1 or
 * JIS X 0208.  Out-of-range pairs are silently dropped.
 */
2310 j_oconv(nkf_char c2, nkf_char c1)
2312 #ifdef NUMCHAR_OPTION
2313 if (c2 == 0 && nkf_char_unicode_p(c1)){
2314 w16e_conv(c1, &c2, &c1);
2315 if (c2 == 0 && nkf_char_unicode_p(c1)){
2316 c2 = c1 & VALUE_MASK;
2317 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2320 c2 = 0x7F + c1 / 94;
2321 c1 = 0x21 + c1 % 94;
2323 if (encode_fallback) (*encode_fallback)(c1);
2330 output_ascii_escape_sequence(ASCII);
2333 else if (c2 == EOF) {
2334 output_ascii_escape_sequence(ASCII);
2337 else if (c2 == ISO_8859_1) {
2338 output_ascii_escape_sequence(ISO_8859_1);
2341 else if (c2 == JIS_X_0201_1976_K) {
2342 output_escape_sequence(JIS_X_0201_1976_K);
2345 } else if (is_eucg3(c2)){
2346 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2347 (*o_putc)(c2 & 0x7f);
2352 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2353 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2354 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
/*
 * e_oconv: output converter that sets output_mode to EUC_JP and writes
 * bytes with the high bit set, i.e. the EUC-JP encoder.
 * A raw Unicode value (c2 == 0 with the Unicode tag on c1) is first
 * converted with w16e_conv(); if that fails, private use values
 * 0xE000..0xE757 are mapped to user-defined rows when x0212_f is set,
 * and anything else is handed to encode_fallback when one is installed.
 * Dispatch on c2:
 *   - 0                 : plain byte, output_mode = ASCII
 *   - JIS_X_0201_1976_K : SS2 followed by c1 | 0x80
 *   - ISO_8859_1        : c1 | 0x80
 *   - G3 codes          : optionally round-tripped through e2s_conv() /
 *                         s2e_conv() under SHIFTJIS_CP932, then written
 *                         as (c2 & 0x7f) | 0x80, c1 | 0x80
 *   - otherwise         : both bytes must be graphic; if not, the input
 *                         code detection is reset with set_iconv(FALSE, 0)
 *                         and the character is dropped.
 */
2361 e_oconv(nkf_char c2, nkf_char c1)
2363 if (c2 == 0 && nkf_char_unicode_p(c1)){
2364 w16e_conv(c1, &c2, &c1);
2365 if (c2 == 0 && nkf_char_unicode_p(c1)){
2366 c2 = c1 & VALUE_MASK;
2367 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2371 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2372 c1 = 0x21 + c1 % 94;
2375 (*o_putc)((c2 & 0x7f) | 0x080);
2376 (*o_putc)(c1 | 0x080);
2378 (*o_putc)((c2 & 0x7f) | 0x080);
2379 (*o_putc)(c1 | 0x080);
2383 if (encode_fallback) (*encode_fallback)(c1);
2391 } else if (c2 == 0) {
2392 output_mode = ASCII;
2394 } else if (c2 == JIS_X_0201_1976_K) {
2395 output_mode = EUC_JP;
2396 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2397 } else if (c2 == ISO_8859_1) {
2398 output_mode = ISO_8859_1;
2399 (*o_putc)(c1 | 0x080);
2401 } else if (is_eucg3(c2)){
2402 output_mode = EUC_JP;
2403 #ifdef SHIFTJIS_CP932
2406 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2407 s2e_conv(s2, s1, &c2, &c1);
2412 output_mode = ASCII;
2414 }else if (is_eucg3(c2)){
2417 (*o_putc)((c2 & 0x7f) | 0x080);
2418 (*o_putc)(c1 | 0x080);
2421 (*o_putc)((c2 & 0x7f) | 0x080);
2422 (*o_putc)(c1 | 0x080);
2426 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2427 set_iconv(FALSE, 0);
2428 return; /* too late to rescue this char */
2430 output_mode = EUC_JP;
2431 (*o_putc)(c2 | 0x080);
2432 (*o_putc)(c1 | 0x080);
/*
 * s_oconv: output converter that sets output_mode to SHIFT_JIS and
 * converts pairs with e2s_conv(), i.e. the Shift_JIS encoder.
 * Under NUMCHAR_OPTION a raw Unicode value is first converted with
 * w16e_conv(); if that fails, private use values 0xE000..0xE757 are
 * mapped to lead bytes starting at 0xF0 (or 0xEB without cp932inv_f),
 * 188 cells per lead byte, unless x0213_f is set; anything else is handed
 * to encode_fallback when one is installed.
 * Dispatch on c2: 0 is plain ASCII, JIS_X_0201_1976_K and ISO_8859_1
 * have their own branches, G3 codes are converted with e2s_conv(), and
 * ordinary pairs must be printable (otherwise detection is reset with
 * set_iconv(FALSE, 0) and the character is dropped).
 * Under SHIFTJIS_CP932 the result may be remapped through cp932inv, and
 * when prefix_table has an entry for the trail byte that prefix is
 * written as well.
 */
2437 s_oconv(nkf_char c2, nkf_char c1)
2439 #ifdef NUMCHAR_OPTION
2440 if (c2 == 0 && nkf_char_unicode_p(c1)){
2441 w16e_conv(c1, &c2, &c1);
2442 if (c2 == 0 && nkf_char_unicode_p(c1)){
2443 c2 = c1 & VALUE_MASK;
2444 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2447 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2449 c1 += 0x40 + (c1 > 0x3e);
2454 if(encode_fallback)(*encode_fallback)(c1);
2463 } else if (c2 == 0) {
2464 output_mode = ASCII;
2466 } else if (c2 == JIS_X_0201_1976_K) {
2467 output_mode = SHIFT_JIS;
2469 } else if (c2 == ISO_8859_1) {
2470 output_mode = ISO_8859_1;
2471 (*o_putc)(c1 | 0x080);
2473 } else if (is_eucg3(c2)){
2474 output_mode = SHIFT_JIS;
2475 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2481 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2482 set_iconv(FALSE, 0);
2483 return; /* too late to rescue this char */
2485 output_mode = SHIFT_JIS;
2486 e2s_conv(c2, c1, &c2, &c1);
2488 #ifdef SHIFTJIS_CP932
2490 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2491 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2497 #endif /* SHIFTJIS_CP932 */
2500 if (prefix_table[(unsigned char)c1]){
2501 (*o_putc)(prefix_table[(unsigned char)c1]);
2507 #ifdef UTF8_OUTPUT_ENABLE
/*
 * w_oconv: output converter producing UTF-8.
 * output_bom_f is cleared once it has been handled.  A raw Unicode value
 * (c2 == 0 with the Unicode tag on c1) is encoded directly; any other
 * pair is first mapped to a code point with e2w_conv().  In both cases
 * nkf_unicode_to_utf8() fills four byte slots and only nonzero
 * continuation slots are written.
 * NOTE(review): the BOM bytes themselves and the write of the first byte
 * are on lines not visible in this listing.
 */
2509 w_oconv(nkf_char c2, nkf_char c1)
2515 output_bom_f = FALSE;
2526 if (c2 == 0 && nkf_char_unicode_p(c1)){
2527 val = c1 & VALUE_MASK;
2528 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2530 if (c2) (*o_putc)(c2);
2531 if (c3) (*o_putc)(c3);
2532 if (c4) (*o_putc)(c4);
2539 val = e2w_conv(c2, c1);
2541 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2543 if (c2) (*o_putc)(c2);
2544 if (c3) (*o_putc)(c3);
2545 if (c4) (*o_putc)(c4);
/*
 * w_oconv16: output converter producing UTF-16 in the byte order given
 * by output_endian.
 * A raw Unicode value in the BMP is split into high and low byte.  Values
 * above the BMP (up to UNICODE_MAX) are written as a surrogate pair:
 * high = (c1 >> 10) + 0xD7C0, low = (c1 & 0x3FF) + 0xDC00, each emitted
 * low-byte-first for ENDIAN_LITTLE and high-byte-first otherwise.  Other
 * pairs are mapped to a code point with e2w_conv() first.
 * NOTE(review): the BOM output following `output_bom_f = FALSE` is not
 * visible in this listing.
 */
2551 w_oconv16(nkf_char c2, nkf_char c1)
2554 output_bom_f = FALSE;
2555 if (output_endian == ENDIAN_LITTLE){
2569 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2570 if (nkf_char_unicode_bmp_p(c1)) {
2571 c2 = (c1 >> 8) & 0xff;
2575 if (c1 <= UNICODE_MAX) {
2576 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2577 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2578 if (output_endian == ENDIAN_LITTLE){
2579 (*o_putc)(c2 & 0xff);
2580 (*o_putc)((c2 >> 8) & 0xff);
2581 (*o_putc)(c1 & 0xff);
2582 (*o_putc)((c1 >> 8) & 0xff);
2584 (*o_putc)((c2 >> 8) & 0xff);
2585 (*o_putc)(c2 & 0xff);
2586 (*o_putc)((c1 >> 8) & 0xff);
2587 (*o_putc)(c1 & 0xff);
2593 nkf_char val = e2w_conv(c2, c1);
2594 c2 = (val >> 8) & 0xff;
2599 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: output converter producing UTF-32 in the byte order given
 * by output_endian.
 * ISO_8859_1 input and raw Unicode values have their own branches; other
 * pairs are mapped to a code point with e2w_conv().  The three visible
 * writes emit the low 24 bits of c1, least significant byte first for
 * ENDIAN_LITTLE and most significant first otherwise.
 * NOTE(review): the write of the fourth (most significant) byte and the
 * BOM output are not visible in this listing.
 */
2609 w_oconv32(nkf_char c2, nkf_char c1)
2612 output_bom_f = FALSE;
2613 if (output_endian == ENDIAN_LITTLE){
2631 if (c2 == ISO_8859_1) {
2633 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2636 c1 = e2w_conv(c2, c1);
2639 if (output_endian == ENDIAN_LITTLE){
2640 (*o_putc)( c1 & 0xFF);
2641 (*o_putc)((c1 >> 8) & 0xFF);
2642 (*o_putc)((c1 >> 16) & 0xFF);
2646 (*o_putc)((c1 >> 16) & 0xFF);
2647 (*o_putc)((c1 >> 8) & 0xFF);
2648 (*o_putc)( c1 & 0xFF);
/*
 * Score bits accumulated per candidate input encoding (see
 * set_code_score / clr_code_score).  Each flag is the previous one
 * shifted left by one bit, so they can be ORed together; h_conv() prefers
 * the candidate with the numerically smallest score, which makes flags
 * defined later in this list weigh more.
 */
2653 #define SCORE_L2 (1) /* Kanji Level 2 */
2654 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2655 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2656 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2657 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2658 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */
2659 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2660 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
/* Initial score given to a candidate by status_reset(). */
2662 #define SCORE_INIT (SCORE_iMIME)
/*
 * Score flags for codes whose first byte satisfies (c2 & 0x70) == 0x20;
 * code_score() indexes this table with c2 & 0x0f.
 * NOTE(review): only part of the initializer is visible in this listing.
 */
2664 static const nkf_char score_table_A0[] = {
2667 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2668 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * Score flags for codes whose first byte satisfies (c2 & 0x70) == 0x70;
 * code_score() indexes this table with c2 & 0x0f (sixteen entries).
 */
2671 static const nkf_char score_table_F0[] = {
2672 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2673 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2674 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2675 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/* set_code_score: OR the flag bits in `score` into ptr->score. */
2679 set_code_score(struct input_code *ptr, nkf_char score)
2682 ptr->score |= score;
/* clr_code_score: clear the flag bits in `score` from ptr->score. */
2687 clr_code_score(struct input_code *ptr, nkf_char score)
2690 ptr->score &= ~score;
/*
 * code_score: update ptr->score from the character currently held in
 * ptr->buf (buf[0] is the first byte, buf[1] the second).
 * The first matching rule applies:
 *   - an error condition (test not visible)   -> SCORE_ERROR
 *   - first byte SS2                          -> SCORE_KANA
 *   - first byte 0x8f                         -> SCORE_X0212
 *   - (UTF8_OUTPUT_ENABLE) e2w_conv() gives 0 -> SCORE_NO_EXIST
 *   - (c2 & 0x70) == 0x20                     -> score_table_A0[c2 & 0x0f]
 *   - (c2 & 0x70) == 0x70                     -> score_table_F0[c2 & 0x0f]
 *   - (c2 & 0x70) >= 0x50                     -> SCORE_L2
 */
2695 code_score(struct input_code *ptr)
2697 nkf_char c2 = ptr->buf[0];
2698 #ifdef UTF8_OUTPUT_ENABLE
2699 nkf_char c1 = ptr->buf[1];
2702 set_code_score(ptr, SCORE_ERROR);
2703 }else if (c2 == SS2){
2704 set_code_score(ptr, SCORE_KANA);
2705 }else if (c2 == 0x8f){
2706 set_code_score(ptr, SCORE_X0212);
2707 #ifdef UTF8_OUTPUT_ENABLE
2708 }else if (!e2w_conv(c2, c1)){
2709 set_code_score(ptr, SCORE_NO_EXIST);
2711 }else if ((c2 & 0x70) == 0x20){
2712 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2713 }else if ((c2 & 0x70) == 0x70){
2714 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2715 }else if ((c2 & 0x70) >= 0x50){
2716 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: take the candidate `ptr` out of the running.  If its
 * converter is the one currently installed as `iconv`, the selection is
 * reset with set_iconv(FALSE, 0).
 * NOTE(review): the statements that mark `ptr` itself as disabled are
 * not visible in this listing.
 */
2721 status_disable(struct input_code *ptr)
2726 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: append byte `c` to ptr->buf and advance ptr->index.
 * No bounds check is performed here; callers are responsible for not
 * overrunning buf.
 */
2730 status_push_ch(struct input_code *ptr, nkf_char c)
2732 ptr->buf[ptr->index++] = c;
2736 status_clear(struct input_code *ptr)
/*
 * status_reset: return the candidate to its starting state; the visible
 * statement restores ptr->score to SCORE_INIT.
 */
2743 status_reset(struct input_code *ptr)
2746 ptr->score = SCORE_INIT;
/*
 * status_reinit: re-initialise the candidate; the visible statement
 * clears ptr->_file_stat.
 */
2750 status_reinit(struct input_code *ptr)
2753 ptr->_file_stat = 0;
/*
 * status_check: acts only when `c` is at most DEL and an input code has
 * already been established (estab_f).
 * NOTE(review): the action taken in that case is not visible in this
 * listing.
 */
2757 status_check(struct input_code *ptr, nkf_char c)
2759 if (c <= DEL && estab_f){
/*
 * s_status: per-byte state machine that judges whether the input stream
 * is plausible for the Shift_JIS candidate `ptr`.
 * First byte:
 *   - 0xa1..0xdf is recorded as SS2 + byte (single-byte katakana)
 *   - 0x81..0x9f, 0xe0..0xea, 0xed..0xee, the IBM extension range
 *     (SHIFTJIS_CP932) and 0xf0..0xfc (X0212_ENABLE) start a two-byte
 *     character
 *   - anything else disables the candidate
 * Second byte: must be in 0x40..0x7e or 0x80..0xfc; the pair is then
 * converted in place with s2e_conv().  The extension ranges additionally
 * set SCORE_CP932.  An invalid second byte disables the candidate.
 * NOTE(review): the switch on ptr->stat and the stat transitions are on
 * lines not visible in this listing.
 */
2765 s_status(struct input_code *ptr, nkf_char c)
2769 status_check(ptr, c);
2774 }else if (nkf_char_unicode_p(c)){
2776 }else if (0xa1 <= c && c <= 0xdf){
2777 status_push_ch(ptr, SS2);
2778 status_push_ch(ptr, c);
2781 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2783 status_push_ch(ptr, c);
2784 }else if (0xed <= c && c <= 0xee){
2786 status_push_ch(ptr, c);
2787 #ifdef SHIFTJIS_CP932
2788 }else if (is_ibmext_in_sjis(c)){
2790 status_push_ch(ptr, c);
2791 #endif /* SHIFTJIS_CP932 */
2793 }else if (0xf0 <= c && c <= 0xfc){
2795 status_push_ch(ptr, c);
2796 #endif /* X0212_ENABLE */
2798 status_disable(ptr);
2802 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2803 status_push_ch(ptr, c);
2804 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2808 status_disable(ptr);
2812 #ifdef SHIFTJIS_CP932
2813 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2814 status_push_ch(ptr, c);
2815 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2816 set_code_score(ptr, SCORE_CP932);
2821 #endif /* SHIFTJIS_CP932 */
2822 status_disable(ptr);
2825 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2826 status_push_ch(ptr, c);
2827 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2828 set_code_score(ptr, SCORE_CP932);
2831 status_disable(ptr);
/*
 * e_status: per-byte state machine that judges whether the input stream
 * is plausible for the EUC-JP candidate `ptr`.
 * First byte: SS2 or 0xa1..0xfe starts a two-byte character, 0x8f starts
 * a three-byte one (X0212_ENABLE); anything else disables the candidate.
 * Following bytes must be in 0xa1..0xfe, otherwise the candidate is
 * disabled.
 * NOTE(review): the switch on ptr->stat and the stat transitions are on
 * lines not visible in this listing.
 */
2838 e_status(struct input_code *ptr, nkf_char c)
2842 status_check(ptr, c);
2847 }else if (nkf_char_unicode_p(c)){
2849 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2851 status_push_ch(ptr, c);
2853 }else if (0x8f == c){
2855 status_push_ch(ptr, c);
2856 #endif /* X0212_ENABLE */
2858 status_disable(ptr);
2862 if (0xa1 <= c && c <= 0xfe){
2863 status_push_ch(ptr, c);
2867 status_disable(ptr);
2872 if (0xa1 <= c && c <= 0xfe){
2874 status_push_ch(ptr, c);
2876 status_disable(ptr);
2878 #endif /* X0212_ENABLE */
2882 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: per-byte state machine that judges whether the input stream
 * is plausible for the UTF-8 candidate `ptr`.
 * Lead bytes 0xc0..0xdf, 0xe0..0xef and 0xf0..0xf4 start a multi-byte
 * character; any other non-ASCII byte disables the candidate.
 * Continuation bytes must be in 0x80..0xbf.  Once enough bytes are
 * collected (ptr->index > ptr->stat) the sequence is checked for being
 * the byte-order mark EF BB BF and converted in place with w2e_conv().
 * NOTE(review): the switch on ptr->stat, the stat transitions and the use
 * made of `bom` are on lines not visible in this listing.
 */
2884 w_status(struct input_code *ptr, nkf_char c)
2888 status_check(ptr, c);
2893 }else if (nkf_char_unicode_p(c)){
2895 }else if (0xc0 <= c && c <= 0xdf){
2897 status_push_ch(ptr, c);
2898 }else if (0xe0 <= c && c <= 0xef){
2900 status_push_ch(ptr, c);
2901 }else if (0xf0 <= c && c <= 0xf4){
2903 status_push_ch(ptr, c);
2905 status_disable(ptr);
2910 if (0x80 <= c && c <= 0xbf){
2911 status_push_ch(ptr, c);
2912 if (ptr->index > ptr->stat){
2913 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2914 && ptr->buf[2] == 0xbf);
2915 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2916 &ptr->buf[0], &ptr->buf[1]);
2923 status_disable(ptr);
2927 if (0x80 <= c && c <= 0xbf){
2928 if (ptr->index < ptr->stat){
2929 status_push_ch(ptr, c);
2934 status_disable(ptr);
/*
 * code_status: feed byte `c` to the status function of every entry in
 * input_code_list and act on the outcome.
 * Entries without a status_func are skipped.  When a single candidate
 * `result` remains and no code has been established yet (!estab_f), its
 * converter is installed with set_iconv(TRUE, ...).  Otherwise, for
 * c <= DEL, the list is walked again from the start.
 * NOTE(review): how `result` and `action_flag` are derived, and what the
 * second walk does, are on lines not visible in this listing.
 */
2942 code_status(nkf_char c)
2944 int action_flag = 1;
2945 struct input_code *result = 0;
2946 struct input_code *p = input_code_list;
2948 if (!p->status_func) {
2952 if (!p->status_func)
2954 (p->status_func)(p, c);
2957 }else if(p->stat == 0){
2968 if (result && !estab_f){
2969 set_iconv(TRUE, result->iconv_func);
2970 }else if (c <= DEL){
2971 struct input_code *ptr = input_code_list;
2981 nkf_buf_t *std_gc_buf;
2982 nkf_char broken_state;
2983 nkf_buf_t *broken_buf;
2984 nkf_char mimeout_state;
2988 static nkf_state_t *nkf_state = NULL;
2990 #define STD_GC_BUFSIZE (256)
/*
 * nkf_state_init: prepare the global `nkf_state`.
 * One path clears the three existing buffers (std_gc_buf, broken_buf,
 * nfc_buf); the other allocates the state with nkf_xmalloc() and creates
 * the buffers with capacities STD_GC_BUFSIZE, 3 and 9.  In either case
 * broken_state and mimeout_state are reset to 0.
 * NOTE(review): the condition choosing between the two paths is not
 * visible; presumably it tests whether nkf_state is already allocated.
 */
2993 nkf_state_init(void)
2996 nkf_buf_clear(nkf_state->std_gc_buf);
2997 nkf_buf_clear(nkf_state->broken_buf);
2998 nkf_buf_clear(nkf_state->nfc_buf);
3001 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3002 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3003 nkf_state->broken_buf = nkf_buf_new(3);
3004 nkf_state->nfc_buf = nkf_buf_new(9);
3006 nkf_state->broken_state = 0;
3007 nkf_state->mimeout_state = 0;
3014 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3015 return nkf_buf_pop(nkf_state->std_gc_buf);
/*
 * std_ungetc: push `c` back by storing it in nkf_state->std_gc_buf; the
 * FILE argument is not used by the visible statement.
 */
3022 std_ungetc(nkf_char c, FILE *f)
3024 nkf_buf_push(nkf_state->std_gc_buf, c);
3030 std_putc(nkf_char c)
3037 static unsigned char hold_buf[HOLD_SIZE*2];
3038 static int hold_count = 0;
/*
 * push_hold_buf: append byte `c2` to hold_buf.
 * Returns EOF when the buffer has just become full (hold_count reached
 * HOLD_SIZE*2), otherwise the new hold_count.  A call made when the
 * buffer is already full is rejected by the first check (its return
 * statement is not visible in this listing).
 */
3040 push_hold_buf(nkf_char c2)
3042 if (hold_count >= HOLD_SIZE*2)
3044 hold_buf[hold_count++] = (unsigned char)c2;
3045 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * h_conv: hold incoming bytes until the input encoding can be decided,
 * then replay them through the chosen converter.
 * Phase 1 reads bytes from `f` into hold_buf until EOF, the buffer fills
 * up, or estab_f becomes set.  If no code was established, the candidate
 * in input_code_list with the lowest score among those having a
 * status_func is installed with set_iconv(TRUE, ...).
 * Phase 2 walks hold_buf: single-byte katakana under s_iconv is passed as
 * JIS_X_0201_1976_K; otherwise two bytes are offered to (*iconv)(), and
 * depending on its result one or two further bytes are taken from
 * hold_buf (or read from `f` once hold_buf is exhausted) before calling
 * the converter again.
 */
3049 h_conv(FILE *f, int c1, int c2)
3055 /** it must NOT be in the kanji shift sequence */
3056 /** it must NOT be written in JIS7 */
3057 /** and it must be after 2 byte 8bit code */
3063 while ((c2 = (*i_getc)(f)) != EOF) {
3069 if (push_hold_buf(c2) == EOF || estab_f) {
3075 struct input_code *p = input_code_list;
3076 struct input_code *result = p;
3081 if (p->status_func && p->score < result->score) {
3086 set_iconv(TRUE, result->iconv_func);
3091 ** 1) EOF is detected, or
3092 ** 2) Code is established, or
3093 ** 3) Buffer is FULL (but last word is pushed)
3095 ** in 1) and 3) cases, we continue to use
3096 ** Kanji codes by oconv and leave estab_f unchanged.
3101 while (hold_index < hold_count){
3102 c1 = hold_buf[hold_index++];
3106 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3107 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3110 if (hold_index < hold_count){
3111 c2 = hold_buf[hold_index++];
3121 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3124 if (hold_index < hold_count){
3125 c3 = hold_buf[hold_index++];
3126 } else if ((c3 = (*i_getc)(f)) == EOF) {
3131 if (hold_index < hold_count){
3132 c4 = hold_buf[hold_index++];
3133 } else if ((c4 = (*i_getc)(f)) == EOF) {
3138 (*iconv)(c1, c2, (c3<<8)|c4);
3143 /* 3 bytes EUC or UTF-8 */
3144 if (hold_index < hold_count){
3145 c3 = hold_buf[hold_index++];
3146 } else if ((c3 = (*i_getc)(f)) == EOF) {
3152 (*iconv)(c1, c2, c3);
3155 if (c3 == EOF) break;
3161 * Check and Ignore BOM
/*
 * Byte-order-mark detection: read up to four bytes from `f` and compare
 * them with the known marks.  On a match, the corresponding converter is
 * installed (only when no input_encoding was requested) and, if that
 * converter is the active one, input_endian is set:
 *   00 00 FE FF -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE -> w_iconv32, ENDIAN_2143
 *   EF BB BF    -> w_iconv
 *   FE FF 00 00 -> w_iconv32, ENDIAN_3412
 *   FE FF       -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00 -> w_iconv32, ENDIAN_LITTLE
 *   FF FE       -> w_iconv16, ENDIAN_LITTLE
 * Bytes that turn out not to belong to a mark are pushed back with
 * i_ungetc in reverse order so the stream is left as it was.
 * NOTE(review): the function header, the case labels and the early
 * returns after a match are on lines not visible in this listing; the
 * first byte of each group is inferred from the value pushed back at the
 * end of the group.
 */
3167 switch(c2 = (*i_getc)(f)){
3169 if((c2 = (*i_getc)(f)) == 0x00){
3170 if((c2 = (*i_getc)(f)) == 0xFE){
3171 if((c2 = (*i_getc)(f)) == 0xFF){
3172 if(!input_encoding){
3173 set_iconv(TRUE, w_iconv32);
3175 if (iconv == w_iconv32) {
3176 input_endian = ENDIAN_BIG;
3179 (*i_ungetc)(0xFF,f);
3180 }else (*i_ungetc)(c2,f);
3181 (*i_ungetc)(0xFE,f);
3182 }else if(c2 == 0xFF){
3183 if((c2 = (*i_getc)(f)) == 0xFE){
3184 if(!input_encoding){
3185 set_iconv(TRUE, w_iconv32);
3187 if (iconv == w_iconv32) {
3188 input_endian = ENDIAN_2143;
3191 (*i_ungetc)(0xFF,f);
3192 }else (*i_ungetc)(c2,f);
3193 (*i_ungetc)(0xFF,f);
3194 }else (*i_ungetc)(c2,f);
3195 (*i_ungetc)(0x00,f);
3196 }else (*i_ungetc)(c2,f);
3197 (*i_ungetc)(0x00,f);
3200 if((c2 = (*i_getc)(f)) == 0xBB){
3201 if((c2 = (*i_getc)(f)) == 0xBF){
3202 if(!input_encoding){
3203 set_iconv(TRUE, w_iconv);
3205 if (iconv == w_iconv) {
3208 (*i_ungetc)(0xBF,f);
3209 }else (*i_ungetc)(c2,f);
3210 (*i_ungetc)(0xBB,f);
3211 }else (*i_ungetc)(c2,f);
3212 (*i_ungetc)(0xEF,f);
3215 if((c2 = (*i_getc)(f)) == 0xFF){
3216 if((c2 = (*i_getc)(f)) == 0x00){
3217 if((c2 = (*i_getc)(f)) == 0x00){
3218 if(!input_encoding){
3219 set_iconv(TRUE, w_iconv32);
3221 if (iconv == w_iconv32) {
3222 input_endian = ENDIAN_3412;
3225 (*i_ungetc)(0x00,f);
3226 }else (*i_ungetc)(c2,f);
3227 (*i_ungetc)(0x00,f);
3228 }else (*i_ungetc)(c2,f);
3229 if(!input_encoding){
3230 set_iconv(TRUE, w_iconv16);
3232 if (iconv == w_iconv16) {
3233 input_endian = ENDIAN_BIG;
3236 (*i_ungetc)(0xFF,f);
3237 }else (*i_ungetc)(c2,f);
3238 (*i_ungetc)(0xFE,f);
3241 if((c2 = (*i_getc)(f)) == 0xFE){
3242 if((c2 = (*i_getc)(f)) == 0x00){
3243 if((c2 = (*i_getc)(f)) == 0x00){
3244 if(!input_encoding){
3245 set_iconv(TRUE, w_iconv32);
3247 if (iconv == w_iconv32) {
3248 input_endian = ENDIAN_LITTLE;
3251 (*i_ungetc)(0x00,f);
3252 }else (*i_ungetc)(c2,f);
3253 (*i_ungetc)(0x00,f);
3254 }else (*i_ungetc)(c2,f);
3255 if(!input_encoding){
3256 set_iconv(TRUE, w_iconv16);
3258 if (iconv == w_iconv16) {
3259 input_endian = ENDIAN_LITTLE;
3262 (*i_ungetc)(0xFE,f);
3263 }else (*i_ungetc)(c2,f);
3264 (*i_ungetc)(0xFF,f);
/*
 * broken_getc: input filter that recognises ISO-2022 designations whose
 * ESC byte is absent.
 * Pending characters in nkf_state->broken_buf are returned first.
 * Otherwise:
 *   - '$' seen in ASCII or JIS X 0201 mode and followed by '@' or 'B', or
 *   - '(' seen in JIS X 0208 or JIS X 0201 mode and followed by 'J' or 'B'
 * (in both cases only when the previous character was not ESC) causes the
 * two characters to be queued in broken_buf.  The last character read is
 * remembered in nkf_state->broken_state.
 * NOTE(review): the value returned in the repaired case (presumably ESC)
 * is on a line not visible in this listing.
 */
3273 broken_getc(FILE *f)
3277 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3278 return nkf_buf_pop(nkf_state->broken_buf);
3281 if (c=='$' && nkf_state->broken_state != ESC
3282 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3284 nkf_state->broken_state = 0;
3285 if (c1=='@'|| c1=='B') {
3286 nkf_buf_push(nkf_state->broken_buf, c1);
3287 nkf_buf_push(nkf_state->broken_buf, c);
3293 } else if (c=='(' && nkf_state->broken_state != ESC
3294 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3296 nkf_state->broken_state = 0;
3297 if (c1=='J'|| c1=='B') {
3298 nkf_buf_push(nkf_state->broken_buf, c1);
3299 nkf_buf_push(nkf_state->broken_buf, c);
3306 nkf_state->broken_state = c;
/*
 * broken_ungetc: push `c` back onto nkf_state->broken_buf, but only while
 * the buffer holds fewer than two characters; further pushes are
 * ignored.  The FILE argument is not used by the visible statements.
 */
3312 broken_ungetc(nkf_char c, FILE *f)
3314 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3315 nkf_buf_push(nkf_state->broken_buf, c);
/*
 * eol_conv: end-of-line filter placed in front of o_eol_conv.
 * Guessing (guess_f): input_eol records the line ending seen so far
 * (LF, CRLF or CR); once two different endings are seen it is set to EOF,
 * meaning "mixed", and guessing stops.
 * Conversion: a CR is not forwarded immediately but remembered in
 * prev_cr.  When a line ending is complete (a pending CR, or an LF), CR
 * is emitted unless eolmode_f is LF and LF is emitted unless eolmode_f is
 * CR.  All other characters are passed through unchanged.
 */
3320 eol_conv(nkf_char c2, nkf_char c1)
3322 if (guess_f && input_eol != EOF) {
3323 if (c2 == 0 && c1 == LF) {
3324 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3325 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3326 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3328 else if (!input_eol) input_eol = CR;
3329 else if (input_eol != CR) input_eol = EOF;
3331 if (prev_cr || (c2 == 0 && c1 == LF)) {
3333 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3334 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3336 if (c2 == 0 && c1 == CR) prev_cr = CR;
3337 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3341 Return value of fold_conv()
3343 LF add newline and output char
3344 CR add newline and output nothing
3347 1 (or else) normal output
3349 fold state in prev (previous character)
3351 >0x80 Japanese (X0208/X0201)
3356 This fold algorithm does not preserve leading spaces in a line.
3357 This is the main difference from fmt.
/*
 * char_size(c2, c1): display width of a character in columns.  A pair
 * with a nonzero first byte c2 counts as 2, a single-byte character
 * (c2 == 0) as 1.  c1 is accepted for symmetry with the converter
 * signatures but is not used.
 * The argument and the whole expansion are parenthesized so that an
 * argument containing a lower-precedence operator (for example `?:`) is
 * evaluated as a unit.
 */
#define char_size(c2,c1) ((c2) ? 2 : 1)
3363 fold_conv(nkf_char c2, nkf_char c1)
3366 nkf_char fold_state;
3368 if (c1== CR && !fold_preserve_f) {
3369 fold_state=0; /* ignore cr */
3370 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3372 fold_state=0; /* ignore cr */
3373 } else if (c1== BS) {
3374 if (f_line>0) f_line--;
3376 } else if (c2==EOF && f_line != 0) { /* close open last line */
3378 } else if ((c1==LF && !fold_preserve_f)
3379 || ((c1==CR||(c1==LF&&f_prev!=CR))
3380 && fold_preserve_f)) {
3382 if (fold_preserve_f) {
3386 } else if ((f_prev == c1 && !fold_preserve_f)
3387 || (f_prev == LF && fold_preserve_f)
3388 ) { /* duplicate newline */
3391 fold_state = LF; /* output two newline */
3397 if (f_prev&0x80) { /* Japanese? */
3399 fold_state = 0; /* ignore given single newline */
3400 } else if (f_prev==SP) {
3404 if (++f_line<=fold_len)
3408 fold_state = CR; /* fold and output nothing */
3412 } else if (c1=='\f') {
3415 fold_state = LF; /* output newline and clear */
3416 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3417 /* X0208 kankaku or ascii space */
3419 fold_state = 0; /* remove duplicate spaces */
3422 if (++f_line<=fold_len)
3423 fold_state = SP; /* output ASCII space only */
3425 f_prev = SP; f_line = 0;
3426 fold_state = CR; /* fold and output nothing */
3430 prev0 = f_prev; /* we still need this one... , but almost done */
3432 if (c2 || c2 == JIS_X_0201_1976_K)
3433 f_prev |= 0x80; /* this is Japanese */
3434 f_line += char_size(c2,c1);
3435 if (f_line<=fold_len) { /* normal case */
3438 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3439 f_line = char_size(c2,c1);
3440 fold_state = LF; /* We can't wait, do fold now */
3441 } else if (c2 == JIS_X_0201_1976_K) {
3442 /* simple kinsoku rules return 1 means no folding */
3443 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3444 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3445 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3446 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3447 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3448 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3449 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3451 fold_state = LF;/* add one new f_line before this character */
3454 fold_state = LF;/* add one new f_line before this character */
3457 /* kinsoku point in ASCII */
3458 if ( c1==')'|| /* { [ ( */
3469 /* just after special */
3470 } else if (!is_alnum(prev0)) {
3471 f_line = char_size(c2,c1);
3473 } else if ((prev0==SP) || /* ignored new f_line */
3474 (prev0==LF)|| /* ignored new f_line */
3475 (prev0&0x80)) { /* X0208 - ASCII */
3476 f_line = char_size(c2,c1);
3477 fold_state = LF;/* add one new f_line before this character */
3479 fold_state = 1; /* default no fold in ASCII */
3483 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3484 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3485 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3486 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3487 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3488 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3489 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3490 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3491 else if (c1==')') fold_state = 1; /*
fullwidth question mark (JIS X 0208 0x2129) */
3492 else if (c1=='*') fold_state = 1; /*
fullwidth exclamation mark (JIS X 0208 0x212A) */
3493 else if (c1=='+') fold_state = 1; /*
voiced sound mark (dakuten, JIS X 0208 0x212B) */
3494 else if (c1==',') fold_state = 1; /*
semi-voiced sound mark (handakuten, JIS X 0208 0x212C) */
3495 /* default no fold in kinsoku */
3498 f_line = char_size(c2,c1);
3499 /* add one new f_line before this character */
3502 f_line = char_size(c2,c1);
3504 /* add one new f_line before this character */
3509 /* terminator process */
3510 switch(fold_state) {
3512 OCONV_NEWLINE((*o_fconv));
3518 OCONV_NEWLINE((*o_fconv));
3529 static nkf_char z_prev2=0,z_prev1=0;
3532 z_conv(nkf_char c2, nkf_char c1)
3535 /* if (c2) c1 &= 0x7f; assertion */
3537 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3543 if (z_prev2 == JIS_X_0201_1976_K) {
3544 if (c2 == JIS_X_0201_1976_K) {
3545 if (c1 == (0xde&0x7f)) { /*
dakuten (voiced sound mark) */
3547 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3549 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
handakuten (semi-voiced sound mark) */
3551 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3556 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3558 if (c2 == JIS_X_0201_1976_K) {
3559 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3560 /* wait for a following dakuten (voiced) or handakuten (semi-voiced) mark */
3565 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3576 if (alpha_f&1 && c2 == 0x23) {
3577 /* JISX0208 Alphabet */
3579 } else if (c2 == 0x21) {
3580 /* JISX0208 Kigou */
3585 } else if (alpha_f&4) {
3590 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3596 if (alpha_f&8 && c2 == 0) {
3598 const char *entity = 0;
3600 case '>': entity = "&gt;"; break; /* alpha_f&8: replace with the HTML character reference */
3601 case '<': entity = "&lt;"; break;
3602 case '\"': entity = "&quot;"; break;
3603 case '&': entity = "&amp;"; break;
3606 while (*entity) (*o_zconv)(0, *entity++);
3612 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3617 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3621 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3625 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3629 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3633 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3637 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3641 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3645 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3650 (*o_zconv)(JIS_X_0201_1976_K, c);
3653 } else if (c2 == 0x25) {
3654 /* JISX0208 Katakana */
3655 static const int fullwidth_to_halfwidth[] =
3657 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3658 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3659 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3660 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3661 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3662 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3663 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3664 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3665 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3666 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3667 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3668 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3670 if (fullwidth_to_halfwidth[c1-0x20]){
3671 c2 = fullwidth_to_halfwidth[c1-0x20];
3672 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3674 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
3684 #define rot13(c) ( \
3686 (c <= 'M') ? (c + 13): \
3687 (c <= 'Z') ? (c - 13): \
3689 (c <= 'm') ? (c + 13): \
3690 (c <= 'z') ? (c - 13): \
3694 #define rot47(c) ( \
3696 ( c <= 'O') ? (c + 47) : \
3697 ( c <= '~') ? (c - 47) : \
3702 rot_conv(nkf_char c2, nkf_char c1)
3704 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3710 (*o_rot_conv)(c2,c1);
3714 hira_conv(nkf_char c2, nkf_char c1)
3718 if (0x20 < c1 && c1 < 0x74) {
3720 (*o_hira_conv)(c2,c1);
3722 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3724 c1 = nkf_char_unicode_new(0x3094);
3725 (*o_hira_conv)(c2,c1);
3728 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3730 (*o_hira_conv)(c2,c1);
3735 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3738 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3740 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3744 (*o_hira_conv)(c2,c1);
3749 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3751 #define RANGE_NUM_MAX 18
3752 static const nkf_char range[RANGE_NUM_MAX][2] = {
3773 nkf_char start, end, c;
3775 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3779 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3784 for (i = 0; i < RANGE_NUM_MAX; i++) {
3785 start = range[i][0];
3788 if (c >= start && c <= end) {
3793 (*o_iso2022jp_check_conv)(c2,c1);
3797 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3799 static const unsigned char *mime_pattern[] = {
3800 (const unsigned char *)"\075?EUC-JP?B?",
3801 (const unsigned char *)"\075?SHIFT_JIS?B?",
3802 (const unsigned char *)"\075?ISO-8859-1?Q?",
3803 (const unsigned char *)"\075?ISO-8859-1?B?",
3804 (const unsigned char *)"\075?ISO-2022-JP?B?",
3805 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3806 #if defined(UTF8_INPUT_ENABLE)
3807 (const unsigned char *)"\075?UTF-8?B?",
3808 (const unsigned char *)"\075?UTF-8?Q?",
3810 (const unsigned char *)"\075?US-ASCII?Q?",
3815 /*
marker for raising the priority of the corresponding code (encoding) */
3816 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3817 e_iconv, s_iconv, 0, 0, 0, 0,
3818 #if defined(UTF8_INPUT_ENABLE)
3824 static const nkf_char mime_encode[] = {
3825 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3826 #if defined(UTF8_INPUT_ENABLE)
3833 static const nkf_char mime_encode_method[] = {
3834 'B', 'B','Q', 'B', 'B', 'Q',
3835 #if defined(UTF8_INPUT_ENABLE)
3843 /* MIME preprocessor fifo */
3845 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3846 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
3847 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
3849 unsigned char buf[MIME_BUF_SIZE];
3851 unsigned int last; /* decoded */
3852 unsigned int input; /* undecoded */
3854 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
3856 #define MAXRECOVER 20
3859 mime_input_buf_unshift(nkf_char c)
3861 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
3865 mime_ungetc(nkf_char c, FILE *f)
3867 mime_input_buf_unshift(c);
3872 mime_ungetc_buf(nkf_char c, FILE *f)
3875 (*i_mungetc_buf)(c,f);
3877 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
3882 mime_getc_buf(FILE *f)
3884 /* we don't keep eof of mime_input_buf, because it contains ?= as
3885 a terminator. It was checked in mime_integrity. */
3886 return ((mimebuf_f)?
3887 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
3891 switch_mime_getc(void)
3893 if (i_getc!=mime_getc) {
3894 i_mgetc = i_getc; i_getc = mime_getc;
3895 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3896 if(mime_f==STRICT_MIME) {
3897 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3898 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3904 unswitch_mime_getc(void)
3906 if(mime_f==STRICT_MIME) {
3907 i_mgetc = i_mgetc_buf;
3908 i_mungetc = i_mungetc_buf;
3911 i_ungetc = i_mungetc;
3912 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3913 mime_iconv_back = NULL;
3917 mime_integrity(FILE *f, const unsigned char *p)
3921 /* In buffered mode, read until =? or NL or buffer full
3923 mime_input_state.input = mime_input_state.top;
3924 mime_input_state.last = mime_input_state.top;
3926 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3928 q = mime_input_state.input;
3929 while((c=(*i_getc)(f))!=EOF) {
3930 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3931 break; /* buffer full */
3933 if (c=='=' && d=='?') {
3934 /* checked. skip header, start decode */
3935 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3936 /* mime_last_input = mime_input_state.input; */
3937 mime_input_state.input = q;
3941 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3943 /* Should we check length mod 4? */
3944 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3947 /* In case of Incomplete MIME, no MIME decode */
3948 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3949 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3950 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3951 switch_mime_getc(); /* anyway we need buffered getc */
3956 mime_begin_strict(FILE *f)
3960 const unsigned char *p,*q;
3961 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3963 mime_decode_mode = FALSE;
3964 /* =? has been checked */
3966 p = mime_pattern[j];
3969 for(i=2;p[i]>SP;i++) { /* start at =? */
3970 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3971 /* pattern fails, try next one */
3973 while (mime_pattern[++j]) {
3974 p = mime_pattern[j];
3975 for(k=2;k<i;k++) /* assume length(p) > i */
3976 if (p[k]!=q[k]) break;
3977 if (k==i && nkf_toupper(c1)==p[k]) break;
3979 p = mime_pattern[j];
3980 if (p) continue; /* found next one, continue */
3981 /* all fails, output from recovery buffer */
3989 mime_decode_mode = p[i-2];
3991 mime_iconv_back = iconv;
3992 set_iconv(FALSE, mime_priority_func[j]);
3993 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
3995 if (mime_decode_mode=='B') {
3996 mimebuf_f = unbuf_f;
3998 /* do MIME integrity check */
3999 return mime_integrity(f,mime_pattern[j]);
4013 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4014 /* re-read and convert again from mime_buffer. */
4016 /* =? has been checked */
4017 k = mime_input_state.last;
4018 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4019 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4020 /* We accept any character type even if it is broken by new lines */
4021 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4022 if (c1==LF||c1==SP||c1==CR||
4023 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4025 /* Failed. But this could be another MIME preamble */
4027 mime_input_state.last--;
4033 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4034 if (!(++i<MAXRECOVER) || c1==EOF) break;
4035 if (c1=='b'||c1=='B') {
4036 mime_decode_mode = 'B';
4037 } else if (c1=='q'||c1=='Q') {
4038 mime_decode_mode = 'Q';
4042 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4043 if (!(++i<MAXRECOVER) || c1==EOF) break;
4045 mime_decode_mode = FALSE;
4051 if (!mime_decode_mode) {
4052 /* false MIME preamble, restart from mime_buffer */
4053 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4054 /* Since we are in MIME mode until buffer becomes empty, */
4055 /* we never go into mime_begin again for a while. */
4058 /* discard mime preamble, and go to MIME mode */
4059 mime_input_state.last = k;
4060 /* do no MIME integrity check */
4061 return c1; /* used only for checking EOF */
4072 debug(const char *str)
4075 fprintf(stderr, "%s\n", str ? str : "NULL");
4081 set_input_codename(const char *codename)
4083 if (!input_codename) {
4084 input_codename = codename;
4085 } else if (strcmp(codename, input_codename) != 0) {
4086 input_codename = "";
4091 get_guessed_code(void)
4093 if (input_codename && !*input_codename) {
4094 input_codename = "BINARY";
4096 struct input_code *p = find_inputcode_byfunc(iconv);
4097 if (!input_codename) {
4098 input_codename = "ASCII";
4099 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4100 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4101 input_codename = "CP932";
4102 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4103 if (p->score & (SCORE_X0212))
4104 input_codename = "EUCJP-MS";
4105 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4106 input_codename = "CP51932";
4107 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4108 if (p->score & (SCORE_KANA))
4109 input_codename = "CP50221";
4110 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4111 input_codename = "CP50220";
4114 return input_codename;
4117 #if !defined(PERL_XS) && !defined(WIN32DLL)
4119 print_guessed_code(char *filename)
4121 if (filename != NULL) printf("%s: ", filename);
4122 if (input_codename && !*input_codename) {
4125 input_codename = get_guessed_code();
4127 printf("%s\n", input_codename);
4131 input_eol == CR ? " (CR)" :
4132 input_eol == LF ? " (LF)" :
4133 input_eol == CRLF ? " (CRLF)" :
4134 input_eol == EOF ? " (MIXED NL)" :
4144 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4146 nkf_char c1, c2, c3;
4152 if (!nkf_isxdigit(c2)){
4157 if (!nkf_isxdigit(c3)){
4162 return (hex2bin(c2) << 4) | hex2bin(c3);
4168 return hex_getc(':', f, i_cgetc, i_cungetc);
4172 cap_ungetc(nkf_char c, FILE *f)
4174 return (*i_cungetc)(c, f);
4180 return hex_getc('%', f, i_ugetc, i_uungetc);
4184 url_ungetc(nkf_char c, FILE *f)
4186 return (*i_uungetc)(c, f);
4190 #ifdef NUMCHAR_OPTION
4192 numchar_getc(FILE *f)
4194 nkf_char (*g)(FILE *) = i_ngetc;
4195 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4206 if (buf[i] == 'x' || buf[i] == 'X'){
4207 for (j = 0; j < 7; j++){
4209 if (!nkf_isxdigit(buf[i])){
4216 c |= hex2bin(buf[i]);
4219 for (j = 0; j < 8; j++){
4223 if (!nkf_isdigit(buf[i])){
4230 c += hex2bin(buf[i]);
4236 return nkf_char_unicode_new(c);
4246 numchar_ungetc(nkf_char c, FILE *f)
4248 return (*i_nungetc)(c, f);
4252 #ifdef UNICODE_NORMALIZATION
4257 nkf_char (*g)(FILE *f) = i_nfc_getc;
4258 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4259 nkf_buf_t *buf = nkf_state->nfc_buf;
4260 const unsigned char *array;
4261 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4262 nkf_char c = (*g)(f);
4264 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4266 nkf_buf_push(buf, (unsigned char)c);
4268 while (lower <= upper) {
4269 int mid = (lower+upper) / 2;
4271 array = normalization_table[mid].nfd;
4272 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4273 if (len >= nkf_buf_length(buf)) {
4277 lower = 1, upper = 0;
4280 nkf_buf_push(buf, c);
4282 if (array[len] != nkf_buf_at(buf, len)) {
4283 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4284 else upper = mid - 1;
4291 array = normalization_table[mid].nfc;
4293 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4294 nkf_buf_push(buf, array[i]);
4298 } while (lower <= upper);
4300 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4301 c = nkf_buf_pop(buf);
4307 nfc_ungetc(nkf_char c, FILE *f)
4309 return (*i_nfc_ungetc)(c, f);
4311 #endif /* UNICODE_NORMALIZATION */
4315 base64decode(nkf_char c)
4320 i = c - 'A'; /* A..Z 0-25 */
4321 } else if (c == '_') {
4322 i = '?' /* 63 */ ; /* _ 63 */
4324 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4326 } else if (c > '/') {
4327 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4328 } else if (c == '+' || c == '-') {
4329 i = '>' /* 62 */ ; /* + and - 62 */
4331 i = '?' /* 63 */ ; /* / 63 */
4339 nkf_char c1, c2, c3, c4, cc;
4340 nkf_char t1, t2, t3, t4, mode, exit_mode;
4341 nkf_char lwsp_count;
4344 nkf_char lwsp_size = 128;
4346 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4347 return mime_input_buf(mime_input_state.top++);
4349 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4350 mime_decode_mode=FALSE;
4351 unswitch_mime_getc();
4352 return (*i_getc)(f);
4355 if (mimebuf_f == FIXED_MIME)
4356 exit_mode = mime_decode_mode;
4359 if (mime_decode_mode == 'Q') {
4360 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4362 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4363 if (c1<=SP || DEL<=c1) {
4364 mime_decode_mode = exit_mode; /* prepare for quit */
4367 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4371 mime_decode_mode = exit_mode; /* prepare for quit */
4372 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4373 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4374 /* end Q encoding */
4375 input_mode = exit_mode;
4377 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4378 while ((c1=(*i_getc)(f))!=EOF) {
4383 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4391 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4392 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4407 lwsp_buf[lwsp_count] = (unsigned char)c1;
4408 if (lwsp_count++>lwsp_size){
4410 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4411 lwsp_buf = lwsp_buf_new;
4417 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4419 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4420 i_ungetc(lwsp_buf[lwsp_count],f);
4423 nkf_xfree(lwsp_buf);
4426 if (c1=='='&&c2<SP) { /* this is soft wrap */
4427 while((c1 = (*i_mgetc)(f)) <=SP) {
4428 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4430 mime_decode_mode = 'Q'; /* still in MIME */
4431 goto restart_mime_q;
4434 mime_decode_mode = 'Q'; /* still in MIME */
4438 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4439 if (c2<=SP) return c2;
4440 mime_decode_mode = 'Q'; /* still in MIME */
4441 return ((hex2bin(c2)<<4) + hex2bin(c3));
4444 if (mime_decode_mode != 'B') {
4445 mime_decode_mode = FALSE;
4446 return (*i_mgetc)(f);
4450 /* Base64 encoding */
4452 MIME allows line breaks in the middle of
4453 Base64, but we are very pessimistic in decoding
4454 in unbuf mode because MIME encoded code may be broken by
4455 less or an editor's control sequence (such as ESC-[-K) in unbuffered
4456 mode. Ignore incomplete MIME.
4458 mode = mime_decode_mode;
4459 mime_decode_mode = exit_mode; /* prepare for quit */
4461 while ((c1 = (*i_mgetc)(f))<=SP) {
4466 if ((c2 = (*i_mgetc)(f))<=SP) {
4469 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4470 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4473 if ((c1 == '?') && (c2 == '=')) {
4476 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4477 while ((c1=(*i_getc)(f))!=EOF) {
4482 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4490 if ((c1=(*i_getc)(f))!=EOF) {
4494 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4509 lwsp_buf[lwsp_count] = (unsigned char)c1;
4510 if (lwsp_count++>lwsp_size){
4512 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4513 lwsp_buf = lwsp_buf_new;
4519 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4521 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4522 i_ungetc(lwsp_buf[lwsp_count],f);
4525 nkf_xfree(lwsp_buf);
4529 if ((c3 = (*i_mgetc)(f))<=SP) {
4532 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4533 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4537 if ((c4 = (*i_mgetc)(f))<=SP) {
4540 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4541 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4545 mime_decode_mode = mode; /* still in MIME sigh... */
4547 /* BASE 64 decoding */
4549 t1 = 0x3f & base64decode(c1);
4550 t2 = 0x3f & base64decode(c2);
4551 t3 = 0x3f & base64decode(c3);
4552 t4 = 0x3f & base64decode(c4);
4553 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4555 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4556 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4558 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4559 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4561 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4566 return mime_input_buf(mime_input_state.top++);
4569 static const char basis_64[] =
4570 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4572 #define MIMEOUT_BUF_LENGTH 74
4574 char buf[MIMEOUT_BUF_LENGTH+1];
4578 /*nkf_char mime_lastchar2, mime_lastchar1;*/
4581 open_mime(nkf_char mode)
4583 const unsigned char *p;
4586 p = mime_pattern[0];
4587 for(i=0;mime_pattern[i];i++) {
4588 if (mode == mime_encode[i]) {
4589 p = mime_pattern[i];
4593 mimeout_mode = mime_encode_method[i];
4595 if (base64_count>45) {
4596 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4597 (*o_mputc)(mimeout_state.buf[i]);
4600 PUT_NEWLINE((*o_mputc));
4603 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
4607 for (;i<mimeout_state.count;i++) {
4608 if (nkf_isspace(mimeout_state.buf[i])) {
4609 (*o_mputc)(mimeout_state.buf[i]);
4619 j = mimeout_state.count;
4620 mimeout_state.count = 0;
4622 mime_putc(mimeout_state.buf[i]);
4627 mime_prechar(nkf_char c2, nkf_char c1)
4629 if (mimeout_mode > 0){
4631 if (base64_count + mimeout_state.count/3*4> 73){
4632 (*o_base64conv)(EOF,0);
4633 OCONV_NEWLINE((*o_base64conv));
4634 (*o_base64conv)(0,SP);
4638 if (base64_count + mimeout_state.count/3*4> 66) {
4639 (*o_base64conv)(EOF,0);
4640 OCONV_NEWLINE((*o_base64conv));
4641 (*o_base64conv)(0,SP);
4647 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4648 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4649 open_mime(output_mode);
4650 (*o_base64conv)(EOF,0);
4651 OCONV_NEWLINE((*o_base64conv));
4652 (*o_base64conv)(0,SP);
4671 switch(mimeout_mode) {
4676 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
4682 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
4687 if (mimeout_mode > 0) {
4688 if (mimeout_f!=FIXED_MIME) {
4690 } else if (mimeout_mode != 'Q')
4696 mimeout_addchar(nkf_char c)
4698 switch(mimeout_mode) {
4703 } else if(!nkf_isalnum(c)) {
4705 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4706 (*o_mputc)(bin2hex((c&0xf)));
4714 nkf_state->mimeout_state=c;
4715 (*o_mputc)(basis_64[c>>2]);
4720 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4721 nkf_state->mimeout_state=c;
4726 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4727 (*o_mputc)(basis_64[c & 0x3F]);
4739 mime_putc(nkf_char c)
4744 if (mimeout_f == FIXED_MIME){
4745 if (mimeout_mode == 'Q'){
4746 if (base64_count > 71){
4747 if (c!=CR && c!=LF) {
4749 PUT_NEWLINE((*o_mputc));
4754 if (base64_count > 71){
4756 PUT_NEWLINE((*o_mputc));
4759 if (c == EOF) { /* c==EOF */
4763 if (c != EOF) { /* c==EOF */
4769 /* mimeout_f != FIXED_MIME */
4771 if (c == EOF) { /* c==EOF */
4772 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4773 j = mimeout_state.count;
4774 mimeout_state.count = 0;
4776 if (mimeout_mode > 0) {
4777 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4779 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4782 mimeout_addchar(mimeout_state.buf[i]);
4786 mimeout_addchar(mimeout_state.buf[i]);
4790 mimeout_addchar(mimeout_state.buf[i]);
4796 mimeout_addchar(mimeout_state.buf[i]);
4802 if (mimeout_state.count > 0){
4803 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4808 if (mimeout_mode=='Q') {
4809 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4810 if (c == CR || c == LF) {
4815 } else if (c <= SP) {
4817 if (base64_count > 70) {
4818 PUT_NEWLINE((*o_mputc));
4821 if (!nkf_isblank(c)) {
4826 if (base64_count > 70) {
4828 PUT_NEWLINE((*o_mputc));
4831 open_mime(output_mode);
4833 if (!nkf_noescape_mime(c)) {
4844 if (mimeout_mode <= 0) {
4845 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4846 if (nkf_isspace(c)) {
4848 if (mimeout_mode == -1) {
4851 if (c==CR || c==LF) {
4853 open_mime(output_mode);
4859 for (i=0;i<mimeout_state.count;i++) {
4860 (*o_mputc)(mimeout_state.buf[i]);
4861 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4872 mimeout_state.buf[0] = (char)c;
4873 mimeout_state.count = 1;
4875 if (base64_count > 1
4876 && base64_count + mimeout_state.count > 76
4877 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4878 static const char *str = "boundary=\"";
4879 static int len = 10;
4882 for (; i < mimeout_state.count - len; ++i) {
4883 if (!strncmp(mimeout_state.buf+i, str, len)) {
4889 if (i == 0 || i == mimeout_state.count - len) {
4890 PUT_NEWLINE((*o_mputc));
4892 if (!nkf_isspace(mimeout_state.buf[0])){
4899 for (j = 0; j <= i; ++j) {
4900 (*o_mputc)(mimeout_state.buf[j]);
4902 PUT_NEWLINE((*o_mputc));
4904 for (; j <= mimeout_state.count; ++j) {
4905 mimeout_state.buf[j - i] = mimeout_state.buf[j];
4907 mimeout_state.count -= i;
4910 mimeout_state.buf[mimeout_state.count++] = (char)c;
4911 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4912 open_mime(output_mode);
4917 if (lastchar==CR || lastchar == LF){
4918 for (i=0;i<mimeout_state.count;i++) {
4919 (*o_mputc)(mimeout_state.buf[i]);
4922 mimeout_state.count = 0;
4925 for (i=0;i<mimeout_state.count-1;i++) {
4926 (*o_mputc)(mimeout_state.buf[i]);
4929 mimeout_state.buf[0] = SP;
4930 mimeout_state.count = 1;
4932 open_mime(output_mode);
4935 /* mimeout_mode == 'B', 1, 2 */
4936 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4937 if (lastchar == CR || lastchar == LF){
4938 if (nkf_isblank(c)) {
4939 for (i=0;i<mimeout_state.count;i++) {
4940 mimeout_addchar(mimeout_state.buf[i]);
4942 mimeout_state.count = 0;
4943 } else if (SP<c && c<DEL) {
4945 for (i=0;i<mimeout_state.count;i++) {
4946 (*o_mputc)(mimeout_state.buf[i]);
4949 mimeout_state.count = 0;
4951 mimeout_state.buf[mimeout_state.count++] = (char)c;
4954 if (nkf_isspace(c)) {
4955 for (i=0;i<mimeout_state.count;i++) {
4956 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4958 for (i=0;i<mimeout_state.count;i++) {
4959 (*o_mputc)(mimeout_state.buf[i]);
4962 mimeout_state.count = 0;
4965 mimeout_state.buf[mimeout_state.count++] = (char)c;
4966 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4968 for (i=0;i<mimeout_state.count;i++) {
4969 (*o_mputc)(mimeout_state.buf[i]);
4972 mimeout_state.count = 0;
4976 if (mimeout_state.count>0 && SP<c && c!='=') {
4977 mimeout_state.buf[mimeout_state.count++] = (char)c;
4978 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4979 j = mimeout_state.count;
4980 mimeout_state.count = 0;
4982 mimeout_addchar(mimeout_state.buf[i]);
4989 if (mimeout_state.count>0) {
4990 j = mimeout_state.count;
4991 mimeout_state.count = 0;
4993 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
4995 mimeout_addchar(mimeout_state.buf[i]);
5001 (*o_mputc)(mimeout_state.buf[i]);
5003 open_mime(output_mode);
5010 base64_conv(nkf_char c2, nkf_char c1)
5012 mime_prechar(c2, c1);
5013 (*o_base64conv)(c2,c1);
5017 typedef struct nkf_iconv_t {
5020 size_t input_buffer_size;
5021 char *output_buffer;
5022 size_t output_buffer_size;
5026 nkf_iconv_new(char *tocode, char *fromcode)
5028 nkf_iconv_t converter;
5030 converter->input_buffer_size = IOBUF_SIZE;
5031 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5032 converter->output_buffer_size = IOBUF_SIZE * 2;
5033 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5034 converter->cd = iconv_open(tocode, fromcode);
5035 if (converter->cd == (iconv_t)-1)
5039 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5042 perror("can't iconv_open");
5048 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5050 size_t invalid = (size_t)0;
5051 char *input_buffer = converter->input_buffer;
5052 size_t input_length = (size_t)0;
5053 char *output_buffer = converter->output_buffer;
5054 size_t output_length = converter->output_buffer_size;
5059 while ((c = (*i_getc)(f)) != EOF) {
5060 input_buffer[input_length++] = c;
5061 if (input_length < converter->input_buffer_size) break;
5065 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5066 while (output_length-- > 0) {
5067 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5069 if (ret == (size_t) - 1) {
5072 if (input_buffer != converter->input_buffer)
5073 memmove(converter->input_buffer, input_buffer, input_length);
5076 converter->output_buffer_size *= 2;
5077 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5078 if (output_buffer == NULL) {
5079 perror("can't realloc");
5082 converter->output_buffer = output_buffer;
5085 perror("can't iconv");
/* Free the converter's input and output buffers, then close its iconv descriptor. */
5098 nkf_iconv_close(nkf_iconv_t *converter)
5100 nkf_xfree(converter->input_buffer); /* field name as set in nkf_iconv_new */
5101 nkf_xfree(converter->output_buffer); /* field name as declared in struct nkf_iconv_t */
5102 iconv_close(converter->cd);
5111 struct input_code *p = input_code_list;
5123 mime_f = MIME_DECODE_DEFAULT;
5124 mime_decode_f = FALSE;
5129 x0201_f = X0201_DEFAULT;
5130 iso2022jp_f = FALSE;
5131 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5132 ms_ucs_map_f = UCS_MAP_ASCII;
5134 #ifdef UTF8_INPUT_ENABLE
5135 no_cp932ext_f = FALSE;
5136 no_best_fit_chars_f = FALSE;
5137 encode_fallback = NULL;
5138 unicode_subchar = '?';
5139 input_endian = ENDIAN_BIG;
5141 #ifdef UTF8_OUTPUT_ENABLE
5142 output_bom_f = FALSE;
5143 output_endian = ENDIAN_BIG;
5145 #ifdef UNICODE_NORMALIZATION
5161 #ifdef SHIFTJIS_CP932
5171 for (i = 0; i < 256; i++){
5172 prefix_table[i] = 0;
5176 mimeout_state.count = 0;
5181 fold_preserve_f = FALSE;
5184 kanji_intro = DEFAULT_J;
5185 ascii_intro = DEFAULT_R;
5186 fold_margin = FOLD_MARGIN;
5187 o_zconv = no_connection;
5188 o_fconv = no_connection;
5189 o_eol_conv = no_connection;
5190 o_rot_conv = no_connection;
5191 o_hira_conv = no_connection;
5192 o_base64conv = no_connection;
5193 o_iso2022jp_check_conv = no_connection;
5196 i_ungetc = std_ungetc;
5198 i_bungetc = std_ungetc;
5201 i_mungetc = std_ungetc;
5202 i_mgetc_buf = std_getc;
5203 i_mungetc_buf = std_ungetc;
5204 output_mode = ASCII;
5206 mime_decode_mode = FALSE;
5212 z_prev2=0,z_prev1=0;
5214 iconv_for_check = 0;
5216 input_codename = NULL;
5217 input_encoding = NULL;
5218 output_encoding = NULL;
5226 module_connection(void)
5228 if (input_encoding) set_input_encoding(input_encoding);
5229 if (!output_encoding) {
5230 output_encoding = nkf_default_encoding();
5232 if (!output_encoding) {
5233 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5236 set_output_encoding(output_encoding);
5237 oconv = nkf_enc_to_oconv(output_encoding);
5240 /* replace continuation module, from output side */
5242 /* output redirection */
5244 if (noout_f || guess_f){
5251 if (mimeout_f == TRUE) {
5252 o_base64conv = oconv; oconv = base64_conv;
5254 /* base64_count = 0; */
5257 if (eolmode_f || guess_f) {
5258 o_eol_conv = oconv; oconv = eol_conv;
5261 o_rot_conv = oconv; oconv = rot_conv;
5264 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5267 o_hira_conv = oconv; oconv = hira_conv;
5270 o_fconv = oconv; oconv = fold_conv;
5273 if (alpha_f || x0201_f) {
5274 o_zconv = oconv; oconv = z_conv;
5278 i_ungetc = std_ungetc;
5279 /* input redirection */
5282 i_cgetc = i_getc; i_getc = cap_getc;
5283 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5286 i_ugetc = i_getc; i_getc = url_getc;
5287 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5290 #ifdef NUMCHAR_OPTION
5292 i_ngetc = i_getc; i_getc = numchar_getc;
5293 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5296 #ifdef UNICODE_NORMALIZATION
5298 i_nfc_getc = i_getc; i_getc = nfc_getc;
5299 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5302 if (mime_f && mimebuf_f==FIXED_MIME) {
5303 i_mgetc = i_getc; i_getc = mime_getc;
5304 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5307 i_bgetc = i_getc; i_getc = broken_getc;
5308 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5310 if (input_encoding) {
5311 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5313 set_iconv(FALSE, e_iconv);
5317 struct input_code *p = input_code_list;
5326 Conversion main loop. Code detection only.
5329 #if !defined(PERL_XS) && !defined(WIN32DLL)
5336 module_connection();
5337 while ((c = (*i_getc)(f)) != EOF)
5344 #define NEXT continue /* no output, get next */
5345 #define SKIP c2=0;continue /* no output, get next */
5346 #define MORE c2=c1;continue /* need one more byte */
5347 #define SEND ; /* output c1 and c2, get next */
5348 #define LAST break /* end of loop, go closing */
5349 #define set_input_mode(mode) do { \
5350 input_mode = mode; \
5352 set_input_codename("ISO-2022-JP"); \
5353 debug("ISO-2022-JP"); \
5357 kanji_convert(FILE *f)
5359 nkf_char c1=0, c2=0, c3=0, c4=0;
5360 int shift_mode = 0; /* 0, 1, 2, 3 */
5362 int is_8bit = FALSE;
5364 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5369 output_mode = ASCII;
5371 if (module_connection() < 0) {
5372 #if !defined(PERL_XS) && !defined(WIN32DLL)
5373 fprintf(stderr, "no output encoding given\n");
5379 #ifdef UTF8_INPUT_ENABLE
5380 if(iconv == w_iconv32){
5381 while ((c1 = (*i_getc)(f)) != EOF &&
5382 (c2 = (*i_getc)(f)) != EOF &&
5383 (c3 = (*i_getc)(f)) != EOF &&
5384 (c4 = (*i_getc)(f)) != EOF) {
5385 nkf_iconv_utf_32(c1, c2, c3, c4);
5387 (*i_ungetc)(EOF, f);
5389 else if (iconv == w_iconv16) {
5390 while ((c1 = (*i_getc)(f)) != EOF &&
5391 (c2 = (*i_getc)(f)) != EOF) {
5392 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5393 (c3 = (*i_getc)(f)) != EOF &&
5394 (c4 = (*i_getc)(f)) != EOF) {
5395 nkf_iconv_utf_16(c1, c2, c3, c4);
5398 (*i_ungetc)(EOF, f);
5402 while ((c1 = (*i_getc)(f)) != EOF) {
5403 #ifdef INPUT_CODE_FIX
5404 if (!input_encoding)
5410 /* in case of 8th bit is on */
5411 if (!estab_f&&!mime_decode_mode) {
5412 /* in case of not established yet */
5413 /* It is still ambiguous */
5414 if (h_conv(f, c2, c1)==EOF) {
5422 /* in case of already established */
5424 /* ignore bogus code */
5432 /* 2nd byte of 7 bit code or SJIS */
5436 else if (nkf_char_unicode_p(c1)) {
5442 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5445 } else if (c1 > DEL) {
5447 if (!estab_f && !iso8859_f) {
5448 /* not established yet */
5450 } else { /* estab_f==TRUE */
5456 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5457 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5459 c2 = JIS_X_0201_1976_K;
5464 /* already established */
5468 } else if (SP < c1 && c1 < DEL) {
5469 /* in case of Roman characters */
5471 /* output 1 shifted byte */
5475 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5476 /* output 1 shifted byte */
5477 c2 = JIS_X_0201_1976_K;
5480 /* looks like bogus code */
5483 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5484 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5485 /* in case of Kanji shifted */
5487 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5488 /* Check MIME code */
5489 if ((c1 = (*i_getc)(f)) == EOF) {
5492 } else if (c1 == '?') {
5493 /* =? is mime conversion start sequence */
5494 if(mime_f == STRICT_MIME) {
5495 /* check in real detail */
5496 if (mime_begin_strict(f) == EOF)
5499 } else if (mime_begin(f) == EOF)
5508 /* normal ASCII code */
5511 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5514 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5517 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5518 if ((c1 = (*i_getc)(f)) == EOF) {
5519 /* (*oconv)(0, ESC); don't send bogus code */
5522 else if (c1 == '&') {
5524 if ((c1 = (*i_getc)(f)) == EOF) {
5530 else if (c1 == '$') {
5532 if ((c1 = (*i_getc)(f)) == EOF) {
5533 /* don't send bogus code
5535 (*oconv)(0, '$'); */
5537 } else if (c1 == '@' || c1 == 'B') {
5539 set_input_mode(JIS_X_0208);
5541 } else if (c1 == '(') {
5543 if ((c1 = (*i_getc)(f)) == EOF) {
5544 /* don't send bogus code
5550 } else if (c1 == '@'|| c1 == 'B') {
5552 set_input_mode(JIS_X_0208);
5555 } else if (c1 == 'D'){
5556 set_input_mode(JIS_X_0212);
5558 #endif /* X0212_ENABLE */
5559 } else if (c1 == 'O' || c1 == 'Q'){
5560 set_input_mode(JIS_X_0213_1);
5562 } else if (c1 == 'P'){
5563 set_input_mode(JIS_X_0213_2);
5566 /* could be some special code */
5573 } else if (broken_f&0x2) {
5574 /* accept any ESC-(-x as broken code ... */
5575 input_mode = JIS_X_0208;
5584 } else if (c1 == '(') {
5586 if ((c1 = (*i_getc)(f)) == EOF) {
5587 /* don't send bogus code
5589 (*oconv)(0, '('); */
5592 else if (c1 == 'I') {
5593 /* JIS X 0201 Katakana */
5594 set_input_mode(JIS_X_0201_1976_K);
5597 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5598 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5599 set_input_mode(ASCII);
5602 else if (broken_f&0x2) {
5603 set_input_mode(ASCII);
5612 else if (c1 == '.') {
5614 if ((c1 = (*i_getc)(f)) == EOF) {
5617 else if (c1 == 'A') {
5628 else if (c1 == 'N') {
5631 if (g2 == ISO_8859_1) {
5646 } else if (c1 == ESC && iconv == s_iconv) {
5647 /* ESC in Shift_JIS */
5648 if ((c1 = (*i_getc)(f)) == EOF) {
5649 /* (*oconv)(0, ESC); don't send bogus code */
5651 } else if (c1 == '$') {
5653 if ((c1 = (*i_getc)(f)) == EOF) {
5655 } else if (('E' <= c1 && c1 <= 'G') ||
5656 ('O' <= c1 && c1 <= 'Q')) {
5664 static const nkf_char jphone_emoji_first_table[7] =
5665 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5666 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5667 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5668 while (SP <= c1 && c1 <= 'z') {
5669 (*oconv)(0, c1 + c3);
5670 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5685 } else if (c1 == LF || c1 == CR) {
5687 input_mode = ASCII; set_iconv(FALSE, 0);
5689 } else if (mime_decode_f && !mime_decode_mode){
5691 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5699 } else { /* if (c1 == CR)*/
5700 if ((c1=(*i_getc)(f))!=EOF) {
5704 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5724 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5727 if ((c3 = (*i_getc)(f)) != EOF) {
5730 if ((c4 = (*i_getc)(f)) != EOF) {
5732 (*iconv)(c2, c1, c3|c4);
5737 /* 3 bytes EUC or UTF-8 */
5738 if ((c3 = (*i_getc)(f)) != EOF) {
5740 (*iconv)(c2, c1, c3);
5748 0x7F <= c2 && c2 <= 0x92 &&
5749 0x21 <= c1 && c1 <= 0x7E) {
5751 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5754 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5758 (*oconv)(PREFIX_EUCG3 | c2, c1);
5760 #endif /* X0212_ENABLE */
5762 (*oconv)(PREFIX_EUCG3 | c2, c1);
5765 (*oconv)(input_mode, c1); /* other special case */
5771 /* goto next_word */
5775 (*iconv)(EOF, 0, 0);
5776 if (!input_codename)
5779 struct input_code *p = input_code_list;
5780 struct input_code *result = p;
5782 if (p->score < result->score) result = p;
5785 set_input_codename(result->name);
5787 debug(result->name);
5795 * int options(unsigned char *cp)
5802 options(unsigned char *cp)
5806 unsigned char *cp_back = NULL;
5811 while(*cp && *cp++!='-');
5812 while (*cp || cp_back) {
5820 case '-': /* literal options */
5821 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
5825 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5826 p = (unsigned char *)long_option[i].name;
5827 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5828 if (*p == cp[j] || cp[j] == SP){
5835 #if !defined(PERL_XS) && !defined(WIN32DLL)
5836 fprintf(stderr, "unknown long option: --%s\n", cp);
5840 while(*cp && *cp != SP && cp++);
5841 if (long_option[i].alias[0]){
5843 cp = (unsigned char *)long_option[i].alias;
5845 if (strcmp(long_option[i].name, "help") == 0){
5849 if (strcmp(long_option[i].name, "ic=") == 0){
5850 enc = nkf_enc_find((char *)p);
5852 input_encoding = enc;
5855 if (strcmp(long_option[i].name, "oc=") == 0){
5856 enc = nkf_enc_find((char *)p);
5857 /* if (enc <= 0) continue; */
5859 output_encoding = enc;
5862 if (strcmp(long_option[i].name, "guess=") == 0){
5863 if (p[0] == '0' || p[0] == '1') {
5871 if (strcmp(long_option[i].name, "overwrite") == 0){
5874 preserve_time_f = TRUE;
5877 if (strcmp(long_option[i].name, "overwrite=") == 0){
5880 preserve_time_f = TRUE;
5882 backup_suffix = (char *)p;
5885 if (strcmp(long_option[i].name, "in-place") == 0){
5888 preserve_time_f = FALSE;
5891 if (strcmp(long_option[i].name, "in-place=") == 0){
5894 preserve_time_f = FALSE;
5896 backup_suffix = (char *)p;
5901 if (strcmp(long_option[i].name, "cap-input") == 0){
5905 if (strcmp(long_option[i].name, "url-input") == 0){
5910 #ifdef NUMCHAR_OPTION
5911 if (strcmp(long_option[i].name, "numchar-input") == 0){
5917 if (strcmp(long_option[i].name, "no-output") == 0){
5921 if (strcmp(long_option[i].name, "debug") == 0){
5926 if (strcmp(long_option[i].name, "cp932") == 0){
5927 #ifdef SHIFTJIS_CP932
5931 #ifdef UTF8_OUTPUT_ENABLE
5932 ms_ucs_map_f = UCS_MAP_CP932;
5936 if (strcmp(long_option[i].name, "no-cp932") == 0){
5937 #ifdef SHIFTJIS_CP932
5941 #ifdef UTF8_OUTPUT_ENABLE
5942 ms_ucs_map_f = UCS_MAP_ASCII;
5946 #ifdef SHIFTJIS_CP932
5947 if (strcmp(long_option[i].name, "cp932inv") == 0){
5954 if (strcmp(long_option[i].name, "x0212") == 0){
5961 if (strcmp(long_option[i].name, "exec-in") == 0){
5965 if (strcmp(long_option[i].name, "exec-out") == 0){
5970 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5971 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5972 no_cp932ext_f = TRUE;
5975 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5976 no_best_fit_chars_f = TRUE;
5979 if (strcmp(long_option[i].name, "fb-skip") == 0){
5980 encode_fallback = NULL;
5983 if (strcmp(long_option[i].name, "fb-html") == 0){
5984 encode_fallback = encode_fallback_html;
5987 if (strcmp(long_option[i].name, "fb-xml") == 0){
5988 encode_fallback = encode_fallback_xml;
5991 if (strcmp(long_option[i].name, "fb-java") == 0){
5992 encode_fallback = encode_fallback_java;
5995 if (strcmp(long_option[i].name, "fb-perl") == 0){
5996 encode_fallback = encode_fallback_perl;
5999 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6000 encode_fallback = encode_fallback_subchar;
6003 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6004 encode_fallback = encode_fallback_subchar;
6005 unicode_subchar = 0;
6007 /* decimal number */
6008 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6009 unicode_subchar *= 10;
6010 unicode_subchar += hex2bin(p[i]);
6012 }else if(p[1] == 'x' || p[1] == 'X'){
6013 /* hexadecimal number */
6014 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6015 unicode_subchar <<= 4;
6016 unicode_subchar |= hex2bin(p[i]);
6020 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6021 unicode_subchar *= 8;
6022 unicode_subchar += hex2bin(p[i]);
6025 w16e_conv(unicode_subchar, &i, &j);
6026 unicode_subchar = i<<8 | j;
6030 #ifdef UTF8_OUTPUT_ENABLE
6031 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6032 ms_ucs_map_f = UCS_MAP_MS;
6036 #ifdef UNICODE_NORMALIZATION
6037 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
6042 if (strcmp(long_option[i].name, "prefix=") == 0){
6043 if (nkf_isgraph(p[0])){
6044 for (i = 1; nkf_isgraph(p[i]); i++){
6045 prefix_table[p[i]] = p[0];
6050 #if !defined(PERL_XS) && !defined(WIN32DLL)
6051 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6056 case 'b': /* buffered mode */
6059 case 'u': /* unbuffered mode */
6062 case 't': /* transparent mode */
6067 } else if (*cp=='2') {
6071 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6079 case 'j': /* JIS output */
6081 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6083 case 'e': /* AT&T EUC output */
6084 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6086 case 's': /* SJIS output */
6087 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6089 case 'l': /* ISO8859 Latin-1 support, no conversion */
6090 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6091 input_encoding = nkf_enc_from_index(ISO_8859_1);
6093 case 'i': /* Kanji IN ESC-$-@/B */
6094 if (*cp=='@'||*cp=='B')
6095 kanji_intro = *cp++;
6097 case 'o': /* ASCII IN ESC-(-J/B/H */
6098 /* ESC ( H was used in initial JUNET messages */
6099 if (*cp=='J'||*cp=='B'||*cp=='H')
6100 ascii_intro = *cp++;
6104 bit:1 katakana->hiragana
6105 bit:2 hiragana->katakana
6107 if ('9'>= *cp && *cp>='0')
6108 hira_f |= (*cp++ -'0');
6115 #if defined(MSDOS) || defined(__OS2__)
6122 show_configuration();
6130 #ifdef UTF8_OUTPUT_ENABLE
6131 case 'w': /* UTF-8 output */
6136 output_encoding = nkf_enc_from_index(UTF_8N);
6138 output_bom_f = TRUE;
6139 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6143 if ('1'== cp[0] && '6'==cp[1]) {
6146 } else if ('3'== cp[0] && '2'==cp[1]) {
6150 output_encoding = nkf_enc_from_index(UTF_8);
6155 output_endian = ENDIAN_LITTLE;
6156 } else if (cp[0] == 'B') {
6159 output_encoding = nkf_enc_from_index(enc_idx);
6164 enc_idx = enc_idx == UTF_16
6165 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6166 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6168 output_bom_f = TRUE;
6169 enc_idx = enc_idx == UTF_16
6170 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6171 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6173 output_encoding = nkf_enc_from_index(enc_idx);
6177 #ifdef UTF8_INPUT_ENABLE
6178 case 'W': /* UTF input */
6181 input_encoding = nkf_enc_from_index(UTF_8);
6184 if ('1'== cp[0] && '6'==cp[1]) {
6186 input_endian = ENDIAN_BIG;
6188 } else if ('3'== cp[0] && '2'==cp[1]) {
6190 input_endian = ENDIAN_BIG;
6193 input_encoding = nkf_enc_from_index(UTF_8);
6198 input_endian = ENDIAN_LITTLE;
6199 } else if (cp[0] == 'B') {
6201 input_endian = ENDIAN_BIG;
6203 enc_idx = (enc_idx == UTF_16
6204 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6205 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6206 input_encoding = nkf_enc_from_index(enc_idx);
6210 /* Input code assumption */
6211 case 'J': /* ISO-2022-JP input */
6212 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6214 case 'E': /* EUC-JP input */
6215 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6217 case 'S': /* Shift_JIS input */
6218 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6220 case 'Z': /* Convert X0208 alphabet to ASCII */
6222 bit:0 Convert JIS X 0208 Alphabet to ASCII
6223 bit:1 Convert Kankaku to one space
6224 bit:2 Convert Kankaku to two spaces
6225 bit:3 Convert HTML Entity
6226 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6228 while ('0'<= *cp && *cp <='9') {
6229 alpha_f |= 1 << (*cp++ - '0');
6231 if (!alpha_f) alpha_f = 1;
6233 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6234 x0201_f = FALSE; /* No X0201->X0208 conversion */
6236 ESC-(-I in JIS, EUC, MS Kanji
6237 SI/SO in JIS, EUC, MS Kanji
6238 SS2 in EUC, JIS, not in MS Kanji
6239 MS Kanji (0xa0-0xdf)
6241 ESC-(-I in JIS (0x20-0x5f)
6242 SS2 in EUC (0xa0-0xdf)
6243 0xa0-0xdf in MS Kanji (0xa0-0xdf)
6246 case 'X': /* Convert X0201 kana to X0208 */
6249 case 'F': /* preserve new lines */
6250 fold_preserve_f = TRUE;
6251 case 'f': /* folding -f60 or -f */
6254 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6256 fold_len += *cp++ - '0';
6258 if (!(0<fold_len && fold_len<BUFSIZ))
6259 fold_len = DEFAULT_FOLD;
6263 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6265 fold_margin += *cp++ - '0';
6269 case 'm': /* MIME support */
6270 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6271 if (*cp=='B'||*cp=='Q') {
6272 mime_decode_mode = *cp++;
6273 mimebuf_f = FIXED_MIME;
6274 } else if (*cp=='N') {
6275 mime_f = TRUE; cp++;
6276 } else if (*cp=='S') {
6277 mime_f = STRICT_MIME; cp++;
6278 } else if (*cp=='0') {
6279 mime_decode_f = FALSE;
6280 mime_f = FALSE; cp++;
6282 mime_f = STRICT_MIME;
6285 case 'M': /* MIME output */
6288 mimeout_f = FIXED_MIME; cp++;
6289 } else if (*cp=='Q') {
6291 mimeout_f = FIXED_MIME; cp++;
6296 case 'B': /* Broken JIS support */
6298 bit:1 allow any x on ESC-(-x or ESC-$-x
6299 bit:2 reset to ascii on NL
6301 if ('9'>= *cp && *cp>='0')
6302 broken_f |= 1<<(*cp++ -'0');
6307 case 'O':/* for Output file */
6311 case 'c':/* add cr code */
6314 case 'd':/* delete cr code */
6317 case 'I': /* ISO-2022-JP output */
6320 case 'L': /* line mode */
6321 if (*cp=='u') { /* unix */
6322 eolmode_f = LF; cp++;
6323 } else if (*cp=='m') { /* mac */
6324 eolmode_f = CR; cp++;
6325 } else if (*cp=='w') { /* windows */
6326 eolmode_f = CRLF; cp++;
6327 } else if (*cp=='0') { /* no conversion */
6328 eolmode_f = 0; cp++;
6333 if ('2' <= *cp && *cp <= '9') {
6336 } else if (*cp == '0' || *cp == '1') {
6345 /* multiple options in a single string are allowed for the Perl module */
6346 while(*cp && *cp++!='-');
6349 #if !defined(PERL_XS) && !defined(WIN32DLL)
6350 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6352 /* bogus option but ignored */
6360 #include "nkf32dll.c"
6361 #elif defined(PERL_XS)
6362 #else /* WIN32DLL */
6364 main(int argc, char **argv)
6369 char *outfname = NULL;
6372 #ifdef EASYWIN /*Easy Win */
6373 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6375 #ifdef DEFAULT_CODE_LOCALE
6376 setlocale(LC_CTYPE, "");
6380 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6381 cp = (unsigned char *)*argv;
6386 if (pipe(fds) < 0 || (pid = fork()) < 0){
6397 execvp(argv[1], &argv[1]);
6414 int debug_f_back = debug_f;
6417 int exec_f_back = exec_f;
6420 int x0212_f_back = x0212_f;
6422 int x0213_f_back = x0213_f;
6423 int guess_f_back = guess_f;
6425 guess_f = guess_f_back;
6428 debug_f = debug_f_back;
6431 exec_f = exec_f_back;
6433 x0212_f = x0212_f_back;
6434 x0213_f = x0213_f_back;
6437 if (binmode_f == TRUE)
6438 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6439 if (freopen("","wb",stdout) == NULL)
6446 setbuf(stdout, (char *) NULL);
6448 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6451 if (binmode_f == TRUE)
6452 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6453 if (freopen("","rb",stdin) == NULL) return (-1);
6457 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
6461 kanji_convert(stdin);
6462 if (guess_f) print_guessed_code(NULL);
6466 int is_argument_error = FALSE;
6468 input_codename = NULL;
6471 iconv_for_check = 0;
6473 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6475 is_argument_error = TRUE;
6483 /* reopen file for stdout */
6484 if (file_out_f == TRUE) {
6487 outfname = nkf_xmalloc(strlen(origfname)
6488 + strlen(".nkftmpXXXXXX")
6490 strcpy(outfname, origfname);
6494 for (i = strlen(outfname); i; --i){
6495 if (outfname[i - 1] == '/'
6496 || outfname[i - 1] == '\\'){
6502 strcat(outfname, "ntXXXXXX");
6504 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6505 S_IREAD | S_IWRITE);
6507 strcat(outfname, ".nkftmpXXXXXX");
6508 fd = mkstemp(outfname);
6511 || (fd_backup = dup(fileno(stdout))) < 0
6512 || dup2(fd, fileno(stdout)) < 0
6523 outfname = "nkf.out";
6526 if(freopen(outfname, "w", stdout) == NULL) {
6530 if (binmode_f == TRUE) {
6531 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6532 if (freopen("","wb",stdout) == NULL)
6539 if (binmode_f == TRUE)
6540 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6541 if (freopen("","rb",fin) == NULL)
6546 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6550 char *filename = NULL;
6552 if (nfiles > 1) filename = origfname;
6553 if (guess_f) print_guessed_code(filename);
6559 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6567 if (dup2(fd_backup, fileno(stdout)) < 0){
6570 if (stat(origfname, &sb)) {
6571 fprintf(stderr, "Can't stat %s\n", origfname);
6573 /* restore the permissions */
6574 if (chmod(outfname, sb.st_mode)) {
6575 fprintf(stderr, "Can't set permission %s\n", outfname);
6578 /* restore the timestamp */
6579 if(preserve_time_f){
6580 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6581 tb[0] = tb[1] = sb.st_mtime;
6582 if (utime(outfname, tb)) {
6583 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6586 tb.actime = sb.st_atime;
6587 tb.modtime = sb.st_mtime;
6588 if (utime(outfname, &tb)) {
6589 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6594 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6596 unlink(backup_filename);
6598 if (rename(origfname, backup_filename)) {
6599 perror(backup_filename);
6600 fprintf(stderr, "Can't rename %s to %s\n",
6601 origfname, backup_filename);
6603 nkf_xfree(backup_filename);
6606 if (unlink(origfname)){
6611 if (rename(outfname, origfname)) {
6613 fprintf(stderr, "Can't rename %s to %s\n",
6614 outfname, origfname);
6616 nkf_xfree(outfname);
6621 if (is_argument_error)
6624 #ifdef EASYWIN /*Easy Win */
6625 if (file_out_f == FALSE)
6626 scanf("%d",&end_check);
6629 #else /* for Other OS */
6630 if (file_out_f == TRUE)
6632 #endif /*Easy Win */
6635 #endif /* WIN32DLL */