2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2009, The nkf Project.
5 * Permission is hereby granted, free of charge, to any person obtaining a copy
6 * of this software and associated documentation files (the "Software"), to deal
7 * in the Software without restriction, including without limitation the rights
8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 * copies of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 #define NKF_VERSION "2.0.8"
24 #define NKF_RELEASE_DATE "2009-01-05"
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2009, The nkf Project."
38 # define INCL_DOSERRORS
44 /* state of output_mode and input_mode
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
164 const nkf_native_encoding *base_encoding;
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
210 } encoding_name_to_id_table[] = {
213 {"ISO-2022-JP", ISO_2022_JP},
214 {"ISO2022JP-CP932", CP50220},
215 {"CP50220", CP50220},
216 {"CP50221", CP50221},
217 {"CSISO2022JP", CP50221},
218 {"CP50222", CP50222},
219 {"ISO-2022-JP-1", ISO_2022_JP_1},
220 {"ISO-2022-JP-3", ISO_2022_JP_3},
221 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
222 {"SHIFT_JIS", SHIFT_JIS},
224 {"WINDOWS-31J", WINDOWS_31J},
225 {"CSWINDOWS31J", WINDOWS_31J},
226 {"CP932", WINDOWS_31J},
227 {"MS932", WINDOWS_31J},
228 {"CP10001", CP10001},
231 {"EUCJP-NKF", EUCJP_NKF},
232 {"CP51932", CP51932},
233 {"EUC-JP-MS", EUCJP_MS},
234 {"EUCJP-MS", EUCJP_MS},
235 {"EUCJPMS", EUCJP_MS},
236 {"EUC-JP-ASCII", EUCJP_ASCII},
237 {"EUCJP-ASCII", EUCJP_ASCII},
238 {"SHIFT_JISX0213", SHIFT_JISX0213},
239 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
240 {"EUC-JISX0213", EUC_JISX0213},
241 {"EUC-JIS-2004", EUC_JIS_2004},
244 {"UTF-8-BOM", UTF_8_BOM},
245 {"UTF8-MAC", UTF8_MAC},
246 {"UTF-8-MAC", UTF8_MAC},
248 {"UTF-16BE", UTF_16BE},
249 {"UTF-16BE-BOM", UTF_16BE_BOM},
250 {"UTF-16LE", UTF_16LE},
251 {"UTF-16LE-BOM", UTF_16LE_BOM},
253 {"UTF-32BE", UTF_32BE},
254 {"UTF-32BE-BOM", UTF_32BE_BOM},
255 {"UTF-32LE", UTF_32LE},
256 {"UTF-32LE-BOM", UTF_32LE_BOM},
261 #if defined(DEFAULT_CODE_JIS)
262 #define DEFAULT_ENCIDX ISO_2022_JP
263 #elif defined(DEFAULT_CODE_SJIS)
264 #define DEFAULT_ENCIDX SHIFT_JIS
265 #elif defined(DEFAULT_CODE_WINDOWS_31J)
266 #define DEFAULT_ENCIDX WINDOWS_31J
267 #elif defined(DEFAULT_CODE_EUC)
268 #define DEFAULT_ENCIDX EUC_JP
269 #elif defined(DEFAULT_CODE_UTF8)
270 #define DEFAULT_ENCIDX UTF_8
/* Nonzero when c is an ASCII letter or decimal digit.
 * The argument is parenthesized so that compound expressions (for example
 * a conditional expression) are classified correctly.  It is still
 * evaluated more than once, so do not pass expressions with side effects. */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))
/* I don't trust portability of toupper */
/* ASCII-only character classification helpers.
 * Every use of the argument is parenthesized so that compound expressions
 * expand with the intended precedence.  Arguments may still be evaluated
 * more than once, so avoid side effects.  SP, TAB, CR and LF are defined
 * elsewhere in this file. */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c) <= 'F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Numeric value (0-15) of the hexadecimal digit c; any character that is
 * not a hexadecimal digit yields 0.  The argument is parenthesized for safe
 * use with compound expressions but is evaluated more than once. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
		    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
		    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Uppercase hexadecimal digit for the low four bits of c.
 * The argument is parenthesized before masking: without it an argument such
 * as `a | b` would mask only `b` and could index past the 16-digit table. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* Nonzero when the second-lowest byte of c2 (taken as an unsigned short)
 * equals SS3.  SS3 is defined elsewhere in this file.  The argument is
 * parenthesized so the cast applies to the whole expression passed in. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* Nonzero when c needs no escaping according to this rule: c is CR or LF,
 * or it lies strictly between SP and DEL and is none of
 * '?', '=', '_', '(', ')', '.', '"' (0x22).
 * CR, LF, SP and DEL are defined elsewhere in this file.  The argument is
 * parenthesized for compound expressions; it is evaluated several times. */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))
/* Nonzero when c2 lies in the inclusive range
 * CP932_TABLE_BEGIN..CP932_TABLE_END (both defined elsewhere). */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
/* Nonzero when SP <= c < 0x60 (0xE0 with the high bit stripped).
 * SP is defined elsewhere in this file. */
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) < (0xE0&0x7F))
300 #define HOLD_SIZE 1024
301 #if defined(INT_IS_SHORT)
302 #define IOBUF_SIZE 2048
304 #define IOBUF_SIZE 16384
307 #define DEFAULT_J 'B'
308 #define DEFAULT_R 'B'
315 /* MIME preprocessor */
317 #ifdef EASYWIN /*Easy Win */
318 extern POINT _BufferSize;
327 void (*status_func)(struct input_code *, nkf_char);
328 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
332 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
333 static nkf_encoding *input_encoding = NULL;
334 static nkf_encoding *output_encoding = NULL;
336 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
338 * 0: Shift_JIS, eucJP-ascii
343 #define UCS_MAP_ASCII 0
345 #define UCS_MAP_CP932 2
346 #define UCS_MAP_CP10001 3
347 static int ms_ucs_map_f = UCS_MAP_ASCII;
349 #ifdef UTF8_INPUT_ENABLE
350 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
351 static int no_cp932ext_f = FALSE;
352 /* ignore ZERO WIDTH NO-BREAK SPACE */
353 static int no_best_fit_chars_f = FALSE;
354 static int input_endian = ENDIAN_BIG;
355 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
356 static void (*encode_fallback)(nkf_char c) = NULL;
357 static void w_status(struct input_code *, nkf_char);
359 #ifdef UTF8_OUTPUT_ENABLE
360 static int output_bom_f = FALSE;
361 static int output_endian = ENDIAN_BIG;
364 static void std_putc(nkf_char c);
365 static nkf_char std_getc(FILE *f);
366 static nkf_char std_ungetc(nkf_char c,FILE *f);
368 static nkf_char broken_getc(FILE *f);
369 static nkf_char broken_ungetc(nkf_char c,FILE *f);
371 static nkf_char mime_getc(FILE *f);
373 static void mime_putc(nkf_char c);
377 #if !defined(PERL_XS) && !defined(WIN32DLL)
378 static unsigned char stdibuf[IOBUF_SIZE];
379 static unsigned char stdobuf[IOBUF_SIZE];
/* Global option flags.  Each starts at the default shown here. */
383 static int unbuf_f = FALSE;
384 static int estab_f = FALSE;
385 static int nop_f = FALSE;
386 static int binmode_f = TRUE; /* binary mode */
387 static int rot_f = FALSE; /* ROT13/47 mode (see the 'r' entry in the usage text) */
388 static int hira_f = FALSE; /* hiragana/katakana conversion */
389 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
390 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
391 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
392 static int mimebuf_f = FALSE; /* MIME buffered input */
393 static int broken_f = FALSE; /* convert ESC-less broken JIS */
394 static int iso8859_f = FALSE; /* ISO8859 through */
395 static int mimeout_f = FALSE; /* base64 mode */
396 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
397 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
399 #ifdef UNICODE_NORMALIZATION
/* Unicode normalization flag and its input hooks; both hooks default to the
 * standard stream functions. */
400 static int nfc_f = FALSE;
401 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc (was labelled "ugetc"; NOTE(review): looks like a copy-paste slip, confirm) */
402 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
406 static int cap_f = FALSE;
407 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
408 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
410 static int url_f = FALSE;
411 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
412 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/* Tagging scheme for nkf_char values.
 * CLASS_MASK selects the top 8 bits (the class) and VALUE_MASK the low
 * 24 bits (the value); CLASS_UNICODE is the class tag OR-ed in by
 * nkf_char_unicode_new().  PREFIX_EUCG3 is the tag OR-ed in by
 * nkf_char_euc3_new().  NKF_INT32_C is defined elsewhere.
 * All macro arguments are parenthesized so compound expressions expand
 * with the intended precedence. */
#define PREFIX_EUCG3	NKF_INT32_C(0x8F00)
#define CLASS_MASK	NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE	NKF_INT32_C(0x01000000)
#define VALUE_MASK	NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX	NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX	NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/* true when the class bits of c are exactly CLASS_UNICODE */
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
/* true when the value bits of c do not exceed UNICODE_BMP_MAX */
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
/* true when the value bits of c do not exceed UNICODE_MAX */
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)
427 #ifdef NUMCHAR_OPTION
/* Flag and input hooks compiled under NUMCHAR_OPTION; both hooks default to
 * the standard stream functions. */
428 static int numchar_f = FALSE;
429 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ngetc (was labelled "ugetc"; NOTE(review): looks like a copy-paste slip, confirm) */
430 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
434 static int noout_f = FALSE;
435 static void no_putc(nkf_char c);
436 static int debug_f = FALSE;
437 static void debug(const char *str);
438 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
441 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
442 static void set_input_codename(const char *codename);
445 static int exec_f = 0;
448 #ifdef SHIFTJIS_CP932
449 /* invert IBM extended characters to others */
450 static int cp51932_f = FALSE;
452 /* invert NEC-selected IBM extended characters to IBM extended characters */
453 static int cp932inv_f = TRUE;
455 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
456 #endif /* SHIFTJIS_CP932 */
458 static int x0212_f = FALSE;
459 static int x0213_f = FALSE;
461 static unsigned char prefix_table[256];
463 static void e_status(struct input_code *, nkf_char);
464 static void s_status(struct input_code *, nkf_char);
466 struct input_code input_code_list[] = {
467 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
468 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
469 #ifdef UTF8_INPUT_ENABLE
470 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
475 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
476 static int base64_count = 0;
478 /* X0208 -> ASCII converter */
481 static int f_line = 0; /* chars in line */
482 static int f_prev = 0;
483 static int fold_preserve_f = FALSE; /* preserve new lines */
484 static int fold_f = FALSE;
485 static int fold_len = 0;
488 static unsigned char kanji_intro = DEFAULT_J;
489 static unsigned char ascii_intro = DEFAULT_R;
493 #define FOLD_MARGIN 10
494 #define DEFAULT_FOLD 60
496 static int fold_margin = FOLD_MARGIN;
498 /* process default */
501 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
503 fprintf(stderr,"nkf internal module connection failure.\n");
509 no_connection(nkf_char c2, nkf_char c1)
511 no_connection2(c2,c1,0);
514 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
515 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
517 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
518 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
519 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
520 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
521 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
522 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
523 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
525 /* static redirections */
/* Replaceable I/O hooks.  Every pointer starts out bound to the standard
 * stream functions (std_putc / std_getc / std_ungetc). */
527 static void (*o_putc)(nkf_char c) = std_putc;
529 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
530 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
532 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of bgetc (was labelled "mgetc"; NOTE(review): looks like a copy-paste slip, confirm) */
533 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
535 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
537 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
538 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
540 /* for strict mime */
541 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
542 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
545 static int output_mode = ASCII; /* output kanji mode */
546 static int input_mode = ASCII; /* input kanji mode */
547 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
549 /* X0201 / X0208 conversion tables */
551 /* X0201 kana conversion table */
553 static const unsigned char cv[]= {
554 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
555 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
556 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
557 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
558 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
559 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
560 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
561 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
562 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
563 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
564 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
565 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
566 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
567 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
568 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
569 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
573 /* X0201 kana conversion table for dakuten */
575 static const unsigned char dv[]= {
576 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
577 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
578 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
579 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
580 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
581 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
582 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
583 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
584 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
585 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
586 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
587 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
594 /* X0201 kana conversion table for handakuten */
596 static const unsigned char ev[]= {
597 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
598 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
604 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
606 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
607 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
608 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 /* X0208 kigou conversion table */
617 /* 0x8140 - 0x819e */
618 static const unsigned char fv[] = {
620 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
621 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
622 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
624 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
625 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
626 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
627 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
628 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
630 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
636 static int option_mode = 0;
637 static int file_out_f = FALSE;
639 static int overwrite_f = FALSE;
640 static int preserve_time_f = FALSE;
641 static int backup_f = FALSE;
642 static char *backup_suffix = "";
645 static int eolmode_f = 0; /* CR, LF, CRLF */
646 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
647 static nkf_char prev_cr = 0; /* CR or 0 */
648 #ifdef EASYWIN /*Easy Win */
649 static int end_check;
652 #define STD_GC_BUFSIZE (256)
653 nkf_char std_gc_buf[STD_GC_BUFSIZE];
657 nkf_xmalloc(size_t size)
661 if (size == 0) size = 1;
665 perror("can't malloc");
/* realloc() wrapper.
 * NOTE(review): several lines of this function (return type, braces, the
 * failure test and what follows perror) are not visible in this excerpt. */
673 nkf_xrealloc(void *ptr, size_t size)
/* a zero-byte request is promoted to one byte before calling realloc */
675 if (size == 0) size = 1;
/* the original pointer is overwritten with realloc's result */
677 ptr = realloc(ptr, size);
/* error report; presumably reached only when realloc fails -- TODO confirm */
679 perror("can't realloc");
686 #define nkf_xfree(ptr) free(ptr)
/* Compare src and target for equality, folding ASCII case with nkf_toupper.
 * Returns FALSE on the first differing character or when the strings have
 * different lengths.
 * NOTE(review): the return type, braces and final return statement are not
 * visible in this excerpt; presumably TRUE is returned on a full match. */
689 nkf_str_caseeql(const char *src, const char *target)
/* walk both strings in step until either one ends */
692 for (i = 0; src[i] && target[i]; i++) {
693 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
/* one string ended before the other, so the lengths differ */
695 if (src[i] || target[i]) return FALSE;
700 nkf_enc_from_index(int idx)
702 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
705 return &nkf_encoding_table[idx];
/* Look up an encoding name in encoding_name_to_id_table and return the id of
 * the first matching entry.
 * NOTE(review): the lines that follow the loop are not visible in this
 * excerpt, so the value returned for an unknown name cannot be confirmed
 * here. */
709 nkf_enc_find_index(const char *name)
/* a leading "X-" is skipped before matching */
712 if (name[0] == 'X' && *(name+1) == '-') name += 2;
/* the table is scanned until an entry with a negative id is reached */
713 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
/* names are compared case-insensitively */
714 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
715 return encoding_name_to_id_table[i].id;
722 nkf_enc_find(const char *name)
725 idx = nkf_enc_find_index(name);
726 if (idx < 0) return 0;
727 return nkf_enc_from_index(idx);
730 #define nkf_enc_name(enc) (enc)->name
731 #define nkf_enc_to_index(enc) (enc)->id
732 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
733 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
734 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
735 #define nkf_enc_asciicompat(enc) (\
736 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
737 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
738 #define nkf_enc_unicode_p(enc) (\
739 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
740 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
741 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
742 #define nkf_enc_cp5022x_p(enc) (\
743 nkf_enc_to_index(enc) == CP50220 ||\
744 nkf_enc_to_index(enc) == CP50221 ||\
745 nkf_enc_to_index(enc) == CP50222)
747 #ifdef DEFAULT_CODE_LOCALE
751 #ifdef HAVE_LANGINFO_H
752 return nl_langinfo(CODESET);
753 #elif defined(__WIN32__)
755 sprintf(buf, "CP%d", GetACP());
757 #elif defined(__OS2__)
758 # if defined(INT_IS_SHORT)
764 ULONG ulCP[1], ulncp;
765 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
766 if (ulCP[0] == 932 || ulCP[0] == 943)
767 strcpy(buf, "Shift_JIS");
769 sprintf(buf, "CP%lu", ulCP[0]);
777 nkf_locale_encoding()
779 nkf_encoding *enc = 0;
780 const char *encname = nkf_locale_charmap();
782 enc = nkf_enc_find(encname);
785 #endif /* DEFAULT_CODE_LOCALE */
790 return &nkf_encoding_table[UTF_8];
794 nkf_default_encoding()
796 nkf_encoding *enc = 0;
797 #ifdef DEFAULT_CODE_LOCALE
798 enc = nkf_locale_encoding();
799 #elif defined(DEFAULT_ENCIDX)
800 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
802 if (!enc) enc = nkf_utf8_encoding();
813 nkf_buf_new(int length)
815 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
816 buf->ptr = nkf_xmalloc(length);
823 nkf_buf_dispose(nkf_buf_t *buf)
829 #define nkf_buf_length(buf) ((buf)->len)
830 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
833 nkf_buf_at(nkf_buf_t *buf, int index)
835 assert(index <= buf->len);
836 return buf->ptr[index];
840 nkf_buf_clear(nkf_buf_t *buf)
846 nkf_buf_push(nkf_buf_t *buf, unsigned char c)
848 assert(buf->capa > buf->len);
849 buf->ptr[buf->len++] = c;
853 nkf_buf_pop(nkf_buf_t *buf)
855 assert(!nkf_buf_empty_p(buf));
856 return buf->ptr[--buf->len];
859 /* Normalization Form C */
862 #define fprintf dllprintf
868 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
875 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
877 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
878 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
879 #ifdef UTF8_OUTPUT_ENABLE
880 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
882 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
883 #ifdef UTF8_INPUT_ENABLE
884 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
889 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
890 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
891 "r {de/en}crypt ROT13/47\n"
892 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
893 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
894 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
895 "l ISO8859-1 (Latin-1) support\n"
896 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
899 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
900 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
901 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
902 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
903 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
907 "T Text mode output\n"
909 "O Output to File (DEFAULT 'nkf.out')\n"
910 "I Convert non ISO-2022-JP charactor to GETA\n"
911 "d,c Convert line breaks -d: LF -c: CRLF\n"
912 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
913 "v, V Show this usage. V: show configuration\n"
916 "Long name options\n"
917 " --ic=<input codeset> --oc=<output codeset>\n"
918 " Specify the input or output codeset\n"
919 " --fj --unix --mac --windows\n"
920 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
921 " Convert for the system or code\n"
922 " --hiragana --katakana --katakana-hiragana\n"
923 " To Hiragana/Katakana Conversion\n"
924 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
928 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
930 #ifdef NUMCHAR_OPTION
931 " --numchar-input Convert Unicode Character Reference\n"
933 #ifdef UTF8_INPUT_ENABLE
934 " --fb-{skip, html, xml, perl, java, subchar}\n"
935 " Specify how nkf handles unassigned characters\n"
940 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
941 " Overwrite original listed files by filtered result\n"
942 " --overwrite preserves timestamp of original files\n"
944 " -g --guess Guess the input code\n"
945 " --help --version Show this help/the version\n"
946 " For more information, see also man nkf\n"
952 show_configuration(void)
955 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
956 " Compile-time options:\n"
957 " Compiled at: " __DATE__ " " __TIME__ "\n"
960 " Default output encoding: "
961 #ifdef DEFAULT_CODE_LOCALE
962 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
963 #elif defined(DEFAULT_ENCIDX)
964 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
970 " Default output end of line: "
971 #if DEFAULT_NEWLINE == CR
973 #elif DEFAULT_NEWLINE == CRLF
979 " Decode MIME encoded string: "
980 #if MIME_DECODE_DEFAULT
986 " Convert JIS X 0201 Katakana: "
993 " --help, --version output: "
994 #if HELP_OUTPUT_HELP_OUTPUT
/* Build a backup file name from suffix and filename in a buffer obtained
 * from nkf_xmalloc, and return that buffer.
 * Two constructions are visible: one substitutes filename for every '*' in
 * suffix, the other appends suffix to filename.
 * NOTE(review): the condition choosing between the two (presumably whether
 * suffix contains any '*') and the closing braces are not visible in this
 * excerpt; releasing the returned buffer is presumably left to the caller. */
1005 get_backup_filename(const char *suffix, const char *filename)
1007 char *backup_filename;
1008 int asterisk_count = 0;
1010 int filename_length = strlen(filename);
/* count the '*' placeholders in suffix */
1012 for(i = 0; suffix[i]; i++){
1013 if(suffix[i] == '*') asterisk_count++;
/* each '*' (1 char) grows to filename_length chars; +1 for the terminator */
1017 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
1018 for(i = 0, j = 0; suffix[i];){
1019 if(suffix[i] == '*'){
/* terminate the partial result so strncat can append to it */
1020 backup_filename[j] = '\0';
1021 strncat(backup_filename, filename, filename_length);
1023 j += filename_length;
/* any other suffix character is copied through unchanged */
1025 backup_filename[j++] = suffix[i++];
1028 backup_filename[j] = '\0';
/* plain form: filename followed by suffix */
1030 j = filename_length + strlen(suffix);
1031 backup_filename = nkf_xmalloc(j + 1);
1032 strcpy(backup_filename, filename);
1033 strcat(backup_filename, suffix);
1034 backup_filename[j] = '\0';
1036 return backup_filename;
1040 #ifdef UTF8_INPUT_ENABLE
1042 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1049 (*f)(0, bin2hex(c>>shift));
1060 encode_fallback_html(nkf_char c)
1065 if(c >= NKF_INT32_C(1000000))
1066 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1067 if(c >= NKF_INT32_C(100000))
1068 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1070 (*oconv)(0, 0x30+(c/10000 )%10);
1072 (*oconv)(0, 0x30+(c/1000 )%10);
1074 (*oconv)(0, 0x30+(c/100 )%10);
1076 (*oconv)(0, 0x30+(c/10 )%10);
1078 (*oconv)(0, 0x30+ c %10);
1084 encode_fallback_xml(nkf_char c)
1089 nkf_each_char_to_hex(oconv, c);
1095 encode_fallback_java(nkf_char c)
1099 if(!nkf_char_unicode_bmp_p(c)){
1103 (*oconv)(0, bin2hex(c>>20));
1104 (*oconv)(0, bin2hex(c>>16));
1108 (*oconv)(0, bin2hex(c>>12));
1109 (*oconv)(0, bin2hex(c>> 8));
1110 (*oconv)(0, bin2hex(c>> 4));
1111 (*oconv)(0, bin2hex(c ));
1116 encode_fallback_perl(nkf_char c)
1121 nkf_each_char_to_hex(oconv, c);
1127 encode_fallback_subchar(nkf_char c)
1129 c = unicode_subchar;
1130 (*oconv)((c>>8)&0xFF, c&0xFF);
1135 static const struct {
1159 {"katakana-hiragana","h3"},
1167 #ifdef UTF8_OUTPUT_ENABLE
1177 {"fb-subchar=", ""},
1179 #ifdef UTF8_INPUT_ENABLE
1180 {"utf8-input", "W"},
1181 {"utf16-input", "W16"},
1182 {"no-cp932ext", ""},
1183 {"no-best-fit-chars",""},
1185 #ifdef UNICODE_NORMALIZATION
1186 {"utf8mac-input", ""},
1198 #ifdef NUMCHAR_OPTION
1199 {"numchar-input", ""},
1205 #ifdef SHIFTJIS_CP932
1216 set_input_encoding(nkf_encoding *enc)
1218 switch (nkf_enc_to_index(enc)) {
1225 #ifdef SHIFTJIS_CP932
1228 #ifdef UTF8_OUTPUT_ENABLE
1229 ms_ucs_map_f = UCS_MAP_CP932;
1239 case ISO_2022_JP_2004:
1246 #ifdef SHIFTJIS_CP932
1249 #ifdef UTF8_OUTPUT_ENABLE
1250 ms_ucs_map_f = UCS_MAP_CP932;
1255 #ifdef SHIFTJIS_CP932
1258 #ifdef UTF8_OUTPUT_ENABLE
1259 ms_ucs_map_f = UCS_MAP_CP10001;
1267 #ifdef SHIFTJIS_CP932
1270 #ifdef UTF8_OUTPUT_ENABLE
1271 ms_ucs_map_f = UCS_MAP_CP932;
1275 #ifdef SHIFTJIS_CP932
1278 #ifdef UTF8_OUTPUT_ENABLE
1279 ms_ucs_map_f = UCS_MAP_MS;
1283 #ifdef SHIFTJIS_CP932
1286 #ifdef UTF8_OUTPUT_ENABLE
1287 ms_ucs_map_f = UCS_MAP_ASCII;
1290 case SHIFT_JISX0213:
1291 case SHIFT_JIS_2004:
1293 #ifdef SHIFTJIS_CP932
1300 #ifdef SHIFTJIS_CP932
1304 #ifdef UTF8_INPUT_ENABLE
1305 #ifdef UNICODE_NORMALIZATION
1313 input_endian = ENDIAN_BIG;
1317 input_endian = ENDIAN_LITTLE;
1322 input_endian = ENDIAN_BIG;
1326 input_endian = ENDIAN_LITTLE;
1333 set_output_encoding(nkf_encoding *enc)
1335 switch (nkf_enc_to_index(enc)) {
1338 #ifdef SHIFTJIS_CP932
1339 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1341 #ifdef UTF8_OUTPUT_ENABLE
1342 ms_ucs_map_f = UCS_MAP_CP932;
1346 #ifdef SHIFTJIS_CP932
1347 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1349 #ifdef UTF8_OUTPUT_ENABLE
1350 ms_ucs_map_f = UCS_MAP_CP932;
1355 #ifdef SHIFTJIS_CP932
1356 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1362 #ifdef SHIFTJIS_CP932
1363 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1369 #ifdef UTF8_OUTPUT_ENABLE
1370 ms_ucs_map_f = UCS_MAP_CP932;
1374 #ifdef UTF8_OUTPUT_ENABLE
1375 ms_ucs_map_f = UCS_MAP_CP10001;
1380 #ifdef SHIFTJIS_CP932
1381 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1383 #ifdef UTF8_OUTPUT_ENABLE
1384 ms_ucs_map_f = UCS_MAP_ASCII;
1389 #ifdef SHIFTJIS_CP932
1390 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1392 #ifdef UTF8_OUTPUT_ENABLE
1393 ms_ucs_map_f = UCS_MAP_ASCII;
1397 #ifdef SHIFTJIS_CP932
1398 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1400 #ifdef UTF8_OUTPUT_ENABLE
1401 ms_ucs_map_f = UCS_MAP_CP932;
1406 #ifdef UTF8_OUTPUT_ENABLE
1407 ms_ucs_map_f = UCS_MAP_MS;
1412 #ifdef UTF8_OUTPUT_ENABLE
1413 ms_ucs_map_f = UCS_MAP_ASCII;
1416 case SHIFT_JISX0213:
1417 case SHIFT_JIS_2004:
1419 #ifdef SHIFTJIS_CP932
1420 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1427 #ifdef SHIFTJIS_CP932
1428 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1431 #ifdef UTF8_OUTPUT_ENABLE
1433 output_bom_f = TRUE;
1437 output_bom_f = TRUE;
1440 output_endian = ENDIAN_LITTLE;
1441 output_bom_f = FALSE;
1444 output_endian = ENDIAN_LITTLE;
1445 output_bom_f = TRUE;
1448 output_bom_f = TRUE;
1451 output_endian = ENDIAN_LITTLE;
1452 output_bom_f = FALSE;
1455 output_endian = ENDIAN_LITTLE;
1456 output_bom_f = TRUE;
1462 static struct input_code*
1463 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1466 struct input_code *p = input_code_list;
1468 if (iconv_func == p->iconv_func){
1478 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1480 #ifdef INPUT_CODE_FIX
1481 if (f || !input_encoding)
1488 #ifdef INPUT_CODE_FIX
1489 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1495 if (estab_f && iconv_for_check != iconv){
1496 struct input_code *p = find_inputcode_byfunc(iconv);
1498 set_input_codename(p->name);
1501 iconv_for_check = iconv;
1508 x0212_shift(nkf_char c)
1513 if (0x75 <= c && c <= 0x7f){
1514 ret = c + (0x109 - 0x75);
1517 if (0x75 <= c && c <= 0x7f){
1518 ret = c + (0x113 - 0x75);
/* Remap a shifted row value c back into the 0x75.. range.
 * NOTE(review): the declaration and initial value of ret, the braces and
 * the return statement are not visible in this excerpt; the offsets mirror
 * those added in x0212_shift, so this appears to be its inverse -- confirm. */
1526 x0212_unshift(nkf_char c)
/* 0x7f..0x88 maps to 0x75..0x7e */
1529 if (0x7f <= c && c <= 0x88){
1530 ret = c + (0x75 - 0x7f);
/* 0x89..0x92 maps to 0x75..0x7e, tagged with PREFIX_EUCG3 and bit 0x80 */
1531 }else if (0x89 <= c && c <= 0x92){
1532 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1536 #endif /* X0212_ENABLE */
1539 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1545 if((0x21 <= ndx && ndx <= 0x2F)){
1546 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1547 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1549 }else if(0x6E <= ndx && ndx <= 0x7E){
1550 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1551 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1557 else if(nkf_isgraph(ndx)){
1559 const unsigned short *ptr;
1560 ptr = x0212_shiftjis[ndx - 0x21];
1562 val = ptr[(c1 & 0x7f) - 0x21];
1571 c2 = x0212_shift(c2);
1573 #endif /* X0212_ENABLE */
1575 if(0x7F < c2) return 1;
1576 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1577 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
1582 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1584 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
1587 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1588 if (0xFC < c1) return 1;
1589 #ifdef SHIFTJIS_CP932
1590 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1591 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1598 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1599 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1605 #endif /* SHIFTJIS_CP932 */
1607 if (!x0213_f && is_ibmext_in_sjis(c2)){
1608 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
1611 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
1624 if(x0213_f && c2 >= 0xF0){
1625 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1626 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1627 }else{ /* 78<=k<=94 */
1628 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1629 if (0x9E < c1) c2++;
1632 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1633 #define SJ6394 0x0161 /* 63 - 94 ku offset */
1634 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1635 if (0x9E < c1) c2++;
1638 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1645 c2 = x0212_unshift(c2);
1652 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * nkf_unicode_to_utf8: store the UTF-8 encoding of code point `val` into
 * *p1..*p4, lead byte first.  Branches visible here: the 2-byte form for
 * val < 0x800, the 3-byte form for BMP values, and the 4-byte form for the
 * remaining values accepted by nkf_char_unicode_value_p().
 */
1654 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1662 }else if (val < 0x800){
1663 *p1 = 0xc0 | (val >> 6);
1664 *p2 = 0x80 | (val & 0x3f);
1667 } else if (nkf_char_unicode_bmp_p(val)) {
1668 *p1 = 0xe0 | (val >> 12);
1669 *p2 = 0x80 | ((val >> 6) & 0x3f);
1670 *p3 = 0x80 | ( val & 0x3f);
1672 } else if (nkf_char_unicode_value_p(val)) {
1673 *p1 = 0xf0 | (val >> 18); /* 4-byte lead byte is 11110xxx and carries bits 20..18 */
1674 *p2 = 0x80 | ((val >> 12) & 0x3f);
1675 *p3 = 0x80 | ((val >> 6) & 0x3f);
1676 *p4 = 0x80 | ( val & 0x3f);
/*
 * nkf_utf8_to_unicode: combine the UTF-8 bytes c1..c4 (c1 is the lead byte)
 * into a single code point.  The sequence length is chosen from c1 alone.
 */
1686 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
1693 else if (c1 <= 0xC1) { /* 0x80-0xBF are trail bytes; 0xC0/0xC1 could only start overlong forms */
1694 /* trail byte or invalid */
1697 else if (c1 <= 0xDF) {
1699 wc = (c1 & 0x1F) << 6;
1702 else if (c1 <= 0xEF) {
1704 wc = (c1 & 0x0F) << 12;
1705 wc |= (c2 & 0x3F) << 6;
1708 else if (c1 <= 0xF4) { /* test the lead byte (was c2), so 0xF5..0xFF fall through as invalid */
1710 wc = (c1 & 0x0F) << 18;
1711 wc |= (c2 & 0x3F) << 12;
1712 wc |= (c3 & 0x3F) << 6;
1722 #ifdef UTF8_INPUT_ENABLE
1724 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1725 const unsigned short *const *pp, nkf_char psize,
1726 nkf_char *p2, nkf_char *p1)
1729 const unsigned short *p;
1732 if (pp == 0) return 1;
1735 if (c1 < 0 || psize <= c1) return 1;
1737 if (p == 0) return 1;
1740 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1742 if (val == 0) return 1;
1743 if (no_cp932ext_f && (
1744 (val>>8) == 0x2D || /* NEC special characters */
1745 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1753 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1761 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1763 const unsigned short *const *pp;
1764 const unsigned short *const *const *ppp;
1765 static const char no_best_fit_chars_table_C2[] =
1766 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1767 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1768 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1769 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1770 static const char no_best_fit_chars_table_C2_ms[] =
1771 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1772 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1773 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1774 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1775 static const char no_best_fit_chars_table_932_C2[] =
1776 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1777 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1778 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1779 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1780 static const char no_best_fit_chars_table_932_C3[] =
1781 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1782 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1783 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1784 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1790 }else if(c2 < 0xe0){
1791 if(no_best_fit_chars_f){
1792 if(ms_ucs_map_f == UCS_MAP_CP932){
1795 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1798 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1801 }else if(!cp932inv_f){
1804 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1807 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1810 }else if(ms_ucs_map_f == UCS_MAP_MS){
1811 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1812 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1830 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1831 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1832 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1834 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1835 }else if(c0 < 0xF0){
1836 if(no_best_fit_chars_f){
1837 if(ms_ucs_map_f == UCS_MAP_CP932){
1838 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1839 }else if(ms_ucs_map_f == UCS_MAP_MS){
1844 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1847 if(c0 == 0x92) return 1;
1852 if(c1 == 0x80 || c0 == 0x9C) return 1;
1855 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1860 if(c0 == 0x94) return 1;
1863 if(c0 == 0xBB) return 1;
1873 if(c0 == 0x95) return 1;
1876 if(c0 == 0xA5) return 1;
1883 if(c0 == 0x8D) return 1;
1886 if(c0 == 0x9E && !cp932inv_f) return 1;
1889 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1897 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1898 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1899 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1901 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1903 #ifdef SHIFTJIS_CP932
1904 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1906 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1907 s2e_conv(s2, s1, p2, p1);
1916 #ifdef UTF8_OUTPUT_ENABLE
1918 e2w_conv(nkf_char c2, nkf_char c1)
1920 const unsigned short *p;
1922 if (c2 == JIS_X_0201_1976_K) {
1923 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1931 p = euc_to_utf8_1byte;
1933 } else if (is_eucg3(c2)){
1934 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
1937 c2 = (c2&0x7f) - 0x21;
1938 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1939 p = x0212_to_utf8_2bytes[c2];
1945 c2 = (c2&0x7f) - 0x21;
1946 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1948 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1949 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1950 euc_to_utf8_2bytes_ms[c2];
1955 c1 = (c1 & 0x7f) - 0x21;
1956 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
1963 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1970 }else if (0xc0 <= c2 && c2 <= 0xef) {
1971 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1972 #ifdef NUMCHAR_OPTION
1975 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1983 #ifdef UTF8_INPUT_ENABLE
1985 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1987 nkf_char c1, c2, c3, c4;
1994 else if (nkf_char_unicode_bmp_p(val)){
1995 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
1996 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
1999 *p1 = nkf_char_unicode_new(val);
2005 *p1 = nkf_char_unicode_new(val);
2012 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2014 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
2015 if (iso2022jp_f && !x0201_f) {
2016 c2 = GETA1; c1 = GETA2;
2018 c2 = JIS_X_0201_1976_K;
2022 }else if (c2 == 0x8f){
2026 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2027 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2028 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
2031 c2 = (c2 << 8) | (c1 & 0x7f);
2033 #ifdef SHIFTJIS_CP932
2036 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2037 s2e_conv(s2, s1, &c2, &c1);
2044 #endif /* SHIFTJIS_CP932 */
2046 #endif /* X0212_ENABLE */
2047 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2050 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2051 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
2052 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2057 #ifdef SHIFTJIS_CP932
2058 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2060 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2061 s2e_conv(s2, s1, &c2, &c1);
2068 #endif /* SHIFTJIS_CP932 */
2076 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
2078 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2079 if (iso2022jp_f && !x0201_f) {
2080 c2 = GETA1; c1 = GETA2;
2084 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2086 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2088 if(c1 == 0x7F) return 0;
2089 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2092 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2093 if (ret) return ret;
2100 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2102 nkf_char ret = 0, c4 = 0;
2103 static const char w_iconv_utf8_1st_byte[] =
2105 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2106 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2107 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2108 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2115 if (c1 < 0 || 0xff < c1) {
2116 }else if (c1 == 0) { /* 0 : 1 byte*/
2118 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2121 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2123 if (c2 < 0x80 || 0xBF < c2) return 0;
2126 if (c3 == 0) return -1;
2127 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2132 if (c3 == 0) return -1;
2133 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2137 if (c3 == 0) return -1;
2138 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2142 if (c3 == 0) return -2;
2143 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2147 if (c3 == 0) return -2;
2148 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2152 if (c3 == 0) return -2;
2153 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2161 if (c1 == 0 || c1 == EOF){
2162 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2163 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2166 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2174 #define NKF_ICONV_INVALID_CODE_RANGE -13
2176 unicode_iconv(nkf_char wc)
2184 }else if ((wc>>11) == 27) {
2185 /* unpaired surrogate */
2186 return NKF_ICONV_INVALID_CODE_RANGE;
2187 }else if (wc < 0xFFFF) {
2188 ret = w16e_conv(wc, &c2, &c1);
2189 if (ret) return ret;
2190 }else if (wc < 0x10FFFF) {
2192 c1 = nkf_char_unicode_new(wc);
2194 return NKF_ICONV_INVALID_CODE_RANGE;
2200 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2201 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
2202 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
2204 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2213 if (input_endian == ENDIAN_BIG) {
2214 if (0xD8 <= c1 && c1 <= 0xDB) {
2215 if (0xDC <= c3 && c3 <= 0xDF) {
2216 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2217 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2222 if (0xD8 <= c2 && c2 <= 0xDB) {
2223 if (0xDC <= c4 && c4 <= 0xDF) {
2224 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2225 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2231 return (*unicode_iconv)(wc);
2235 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2241 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
2247 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2256 switch(input_endian){
2258 wc = c2 << 16 | c3 << 8 | c4;
2261 wc = c3 << 16 | c2 << 8 | c1;
2264 wc = c1 << 16 | c4 << 8 | c3;
2267 wc = c4 << 16 | c1 << 8 | c2;
2270 return NKF_ICONV_INVALID_CODE_RANGE;
2273 return (*unicode_iconv)(wc);
2277 #define output_ascii_escape_sequence(mode) do { \
2278 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2281 (*o_putc)(ascii_intro); \
2282 output_mode = mode; \
2287 output_escape_sequence(int mode)
2289 if (output_mode == mode)
2297 case JIS_X_0201_1976_K:
2305 (*o_putc)(kanji_intro);
2330 j_oconv(nkf_char c2, nkf_char c1)
2332 #ifdef NUMCHAR_OPTION
2333 if (c2 == 0 && nkf_char_unicode_p(c1)){
2334 w16e_conv(c1, &c2, &c1);
2335 if (c2 == 0 && nkf_char_unicode_p(c1)){
2336 c2 = c1 & VALUE_MASK;
2337 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2340 c2 = 0x7F + c1 / 94;
2341 c1 = 0x21 + c1 % 94;
2343 if (encode_fallback) (*encode_fallback)(c1);
2350 output_ascii_escape_sequence(ASCII);
2353 else if (c2 == EOF) {
2354 output_ascii_escape_sequence(ASCII);
2357 else if (c2 == ISO_8859_1) {
2358 output_ascii_escape_sequence(ISO_8859_1);
2361 else if (c2 == JIS_X_0201_1976_K) {
2362 output_escape_sequence(JIS_X_0201_1976_K);
2365 } else if (is_eucg3(c2)){
2366 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2367 (*o_putc)(c2 & 0x7f);
2372 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2373 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2374 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2381 e_oconv(nkf_char c2, nkf_char c1)
2383 if (c2 == 0 && nkf_char_unicode_p(c1)){
2384 w16e_conv(c1, &c2, &c1);
2385 if (c2 == 0 && nkf_char_unicode_p(c1)){
2386 c2 = c1 & VALUE_MASK;
2387 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2391 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2392 c1 = 0x21 + c1 % 94;
2395 (*o_putc)((c2 & 0x7f) | 0x080);
2396 (*o_putc)(c1 | 0x080);
2398 (*o_putc)((c2 & 0x7f) | 0x080);
2399 (*o_putc)(c1 | 0x080);
2403 if (encode_fallback) (*encode_fallback)(c1);
2411 } else if (c2 == 0) {
2412 output_mode = ASCII;
2414 } else if (c2 == JIS_X_0201_1976_K) {
2415 output_mode = EUC_JP;
2416 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2417 } else if (c2 == ISO_8859_1) {
2418 output_mode = ISO_8859_1;
2419 (*o_putc)(c1 | 0x080);
2421 } else if (is_eucg3(c2)){
2422 output_mode = EUC_JP;
2423 #ifdef SHIFTJIS_CP932
2426 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2427 s2e_conv(s2, s1, &c2, &c1);
2432 output_mode = ASCII;
2434 }else if (is_eucg3(c2)){
2437 (*o_putc)((c2 & 0x7f) | 0x080);
2438 (*o_putc)(c1 | 0x080);
2441 (*o_putc)((c2 & 0x7f) | 0x080);
2442 (*o_putc)(c1 | 0x080);
2446 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2447 set_iconv(FALSE, 0);
2448 return; /* too late to rescue this char */
2450 output_mode = EUC_JP;
2451 (*o_putc)(c2 | 0x080);
2452 (*o_putc)(c1 | 0x080);
2457 s_oconv(nkf_char c2, nkf_char c1)
2459 #ifdef NUMCHAR_OPTION
2460 if (c2 == 0 && nkf_char_unicode_p(c1)){
2461 w16e_conv(c1, &c2, &c1);
2462 if (c2 == 0 && nkf_char_unicode_p(c1)){
2463 c2 = c1 & VALUE_MASK;
2464 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2467 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2469 c1 += 0x40 + (c1 > 0x3e);
2474 if(encode_fallback)(*encode_fallback)(c1);
2483 } else if (c2 == 0) {
2484 output_mode = ASCII;
2486 } else if (c2 == JIS_X_0201_1976_K) {
2487 output_mode = SHIFT_JIS;
2489 } else if (c2 == ISO_8859_1) {
2490 output_mode = ISO_8859_1;
2491 (*o_putc)(c1 | 0x080);
2493 } else if (is_eucg3(c2)){
2494 output_mode = SHIFT_JIS;
2495 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2501 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2502 set_iconv(FALSE, 0);
2503 return; /* too late to rescue this char */
2505 output_mode = SHIFT_JIS;
2506 e2s_conv(c2, c1, &c2, &c1);
2508 #ifdef SHIFTJIS_CP932
2510 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2511 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2517 #endif /* SHIFTJIS_CP932 */
2520 if (prefix_table[(unsigned char)c1]){
2521 (*o_putc)(prefix_table[(unsigned char)c1]);
2527 #ifdef UTF8_OUTPUT_ENABLE
2529 w_oconv(nkf_char c2, nkf_char c1)
2535 output_bom_f = FALSE;
2546 if (c2 == 0 && nkf_char_unicode_p(c1)){
2547 val = c1 & VALUE_MASK;
2548 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2550 if (c2) (*o_putc)(c2);
2551 if (c3) (*o_putc)(c3);
2552 if (c4) (*o_putc)(c4);
2559 val = e2w_conv(c2, c1);
2561 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2563 if (c2) (*o_putc)(c2);
2564 if (c3) (*o_putc)(c3);
2565 if (c4) (*o_putc)(c4);
2571 w_oconv16(nkf_char c2, nkf_char c1)
2574 output_bom_f = FALSE;
2575 if (output_endian == ENDIAN_LITTLE){
2589 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2590 if (nkf_char_unicode_bmp_p(c1)) {
2591 c2 = (c1 >> 8) & 0xff;
2595 if (c1 <= UNICODE_MAX) {
2596 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2597 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2598 if (output_endian == ENDIAN_LITTLE){
2599 (*o_putc)(c2 & 0xff);
2600 (*o_putc)((c2 >> 8) & 0xff);
2601 (*o_putc)(c1 & 0xff);
2602 (*o_putc)((c1 >> 8) & 0xff);
2604 (*o_putc)((c2 >> 8) & 0xff);
2605 (*o_putc)(c2 & 0xff);
2606 (*o_putc)((c1 >> 8) & 0xff);
2607 (*o_putc)(c1 & 0xff);
2613 nkf_char val = e2w_conv(c2, c1);
2614 c2 = (val >> 8) & 0xff;
2619 if (output_endian == ENDIAN_LITTLE){
2629 w_oconv32(nkf_char c2, nkf_char c1)
2632 output_bom_f = FALSE;
2633 if (output_endian == ENDIAN_LITTLE){
2651 if (c2 == ISO_8859_1) {
2653 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2656 c1 = e2w_conv(c2, c1);
2659 if (output_endian == ENDIAN_LITTLE){
2660 (*o_putc)( c1 & 0xFF);
2661 (*o_putc)((c1 >> 8) & 0xFF);
2662 (*o_putc)((c1 >> 16) & 0xFF);
2666 (*o_putc)((c1 >> 16) & 0xFF);
2667 (*o_putc)((c1 >> 8) & 0xFF);
2668 (*o_putc)( c1 & 0xFF);
2673 #define SCORE_L2 (1) /* Kanji Level 2 */
2674 #define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */
2675 #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */
2676 #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */
2677 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2678 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */
2679 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */
2680 #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */
2682 #define SCORE_INIT (SCORE_iMIME)
2684 static const nkf_char score_table_A0[] = {
2687 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2688 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2691 static const nkf_char score_table_F0[] = {
2692 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2693 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2694 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2695 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2699 set_code_score(struct input_code *ptr, nkf_char score)
2702 ptr->score |= score;
2707 clr_code_score(struct input_code *ptr, nkf_char score)
2710 ptr->score &= ~score;
2715 code_score(struct input_code *ptr)
2717 nkf_char c2 = ptr->buf[0];
2718 #ifdef UTF8_OUTPUT_ENABLE
2719 nkf_char c1 = ptr->buf[1];
2722 set_code_score(ptr, SCORE_ERROR);
2723 }else if (c2 == SS2){
2724 set_code_score(ptr, SCORE_KANA);
2725 }else if (c2 == 0x8f){
2726 set_code_score(ptr, SCORE_X0212);
2727 #ifdef UTF8_OUTPUT_ENABLE
2728 }else if (!e2w_conv(c2, c1)){
2729 set_code_score(ptr, SCORE_NO_EXIST);
2731 }else if ((c2 & 0x70) == 0x20){
2732 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2733 }else if ((c2 & 0x70) == 0x70){
2734 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2735 }else if ((c2 & 0x70) >= 0x50){
2736 set_code_score(ptr, SCORE_L2);
2741 status_disable(struct input_code *ptr)
2746 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2750 status_push_ch(struct input_code *ptr, nkf_char c)
2752 ptr->buf[ptr->index++] = c;
2756 status_clear(struct input_code *ptr)
2763 status_reset(struct input_code *ptr)
2766 ptr->score = SCORE_INIT;
2770 status_reinit(struct input_code *ptr)
2773 ptr->_file_stat = 0;
2777 status_check(struct input_code *ptr, nkf_char c)
2779 if (c <= DEL && estab_f){
2785 s_status(struct input_code *ptr, nkf_char c)
2789 status_check(ptr, c);
2794 }else if (nkf_char_unicode_p(c)){
2796 }else if (0xa1 <= c && c <= 0xdf){
2797 status_push_ch(ptr, SS2);
2798 status_push_ch(ptr, c);
2801 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2803 status_push_ch(ptr, c);
2804 }else if (0xed <= c && c <= 0xee){
2806 status_push_ch(ptr, c);
2807 #ifdef SHIFTJIS_CP932
2808 }else if (is_ibmext_in_sjis(c)){
2810 status_push_ch(ptr, c);
2811 #endif /* SHIFTJIS_CP932 */
2813 }else if (0xf0 <= c && c <= 0xfc){
2815 status_push_ch(ptr, c);
2816 #endif /* X0212_ENABLE */
2818 status_disable(ptr);
2822 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2823 status_push_ch(ptr, c);
2824 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2828 status_disable(ptr);
2832 #ifdef SHIFTJIS_CP932
2833 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2834 status_push_ch(ptr, c);
2835 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2836 set_code_score(ptr, SCORE_CP932);
2841 #endif /* SHIFTJIS_CP932 */
2842 status_disable(ptr);
2845 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2846 status_push_ch(ptr, c);
2847 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2848 set_code_score(ptr, SCORE_CP932);
2851 status_disable(ptr);
2858 e_status(struct input_code *ptr, nkf_char c)
2862 status_check(ptr, c);
2867 }else if (nkf_char_unicode_p(c)){
2869 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2871 status_push_ch(ptr, c);
2873 }else if (0x8f == c){
2875 status_push_ch(ptr, c);
2876 #endif /* X0212_ENABLE */
2878 status_disable(ptr);
2882 if (0xa1 <= c && c <= 0xfe){
2883 status_push_ch(ptr, c);
2887 status_disable(ptr);
2892 if (0xa1 <= c && c <= 0xfe){
2894 status_push_ch(ptr, c);
2896 status_disable(ptr);
2898 #endif /* X0212_ENABLE */
2902 #ifdef UTF8_INPUT_ENABLE
2904 w_status(struct input_code *ptr, nkf_char c)
2908 status_check(ptr, c);
2913 }else if (nkf_char_unicode_p(c)){
2915 }else if (0xc0 <= c && c <= 0xdf){
2917 status_push_ch(ptr, c);
2918 }else if (0xe0 <= c && c <= 0xef){
2920 status_push_ch(ptr, c);
2921 }else if (0xf0 <= c && c <= 0xf4){
2923 status_push_ch(ptr, c);
2925 status_disable(ptr);
2930 if (0x80 <= c && c <= 0xbf){
2931 status_push_ch(ptr, c);
2932 if (ptr->index > ptr->stat){
2933 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2934 && ptr->buf[2] == 0xbf);
2935 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2936 &ptr->buf[0], &ptr->buf[1]);
2943 status_disable(ptr);
2947 if (0x80 <= c && c <= 0xbf){
2948 if (ptr->index < ptr->stat){
2949 status_push_ch(ptr, c);
2954 status_disable(ptr);
2962 code_status(nkf_char c)
2964 int action_flag = 1;
2965 struct input_code *result = 0;
2966 struct input_code *p = input_code_list;
2968 if (!p->status_func) {
2972 if (!p->status_func)
2974 (p->status_func)(p, c);
2977 }else if(p->stat == 0){
2988 if (result && !estab_f){
2989 set_iconv(TRUE, result->iconv_func);
2990 }else if (c <= DEL){
2991 struct input_code *ptr = input_code_list;
3005 return std_gc_buf[--std_gc_ndx];
3012 std_ungetc(nkf_char c, FILE *f)
3014 if (std_gc_ndx == STD_GC_BUFSIZE){
3017 std_gc_buf[std_gc_ndx++] = c;
3023 std_putc(nkf_char c)
3030 static unsigned char hold_buf[HOLD_SIZE*2];
3031 static int hold_count = 0;
3033 push_hold_buf(nkf_char c2)
3035 if (hold_count >= HOLD_SIZE*2)
3037 hold_buf[hold_count++] = (unsigned char)c2;
3038 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3042 h_conv(FILE *f, int c1, int c2)
3048 /** it must NOT be in the kanji shift sequence */
3049 /** it must NOT be written in JIS7 */
3050 /** and it must be after 2 byte 8bit code */
3056 while ((c2 = (*i_getc)(f)) != EOF) {
3062 if (push_hold_buf(c2) == EOF || estab_f) {
3068 struct input_code *p = input_code_list;
3069 struct input_code *result = p;
3074 if (p->status_func && p->score < result->score) {
3079 set_iconv(TRUE, result->iconv_func);
3084 ** 1) EOF is detected, or
3085 ** 2) Code is established, or
3086 ** 3) Buffer is FULL (but last word is pushed)
3088 ** in 1) and 3) cases, we continue to use
3089 ** Kanji codes by oconv and leave estab_f unchanged.
3094 while (hold_index < hold_count){
3095 c1 = hold_buf[hold_index++];
3099 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3100 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3103 if (hold_index < hold_count){
3104 c2 = hold_buf[hold_index++];
3114 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3117 if (hold_index < hold_count){
3118 c3 = hold_buf[hold_index++];
3119 } else if ((c3 = (*i_getc)(f)) == EOF) {
3124 if (hold_index < hold_count){
3125 c4 = hold_buf[hold_index++];
3126 } else if ((c4 = (*i_getc)(f)) == EOF) {
3131 (*iconv)(c1, c2, (c3<<8)|c4);
3136 /* 3 bytes EUC or UTF-8 */
3137 if (hold_index < hold_count){
3138 c3 = hold_buf[hold_index++];
3139 } else if ((c3 = (*i_getc)(f)) == EOF) {
3145 (*iconv)(c1, c2, c3);
3148 if (c3 == EOF) break;
3154 * Check and Ignore BOM
3160 switch(c2 = (*i_getc)(f)){
3162 if((c2 = (*i_getc)(f)) == 0x00){
3163 if((c2 = (*i_getc)(f)) == 0xFE){
3164 if((c2 = (*i_getc)(f)) == 0xFF){
3165 if(!input_encoding){
3166 set_iconv(TRUE, w_iconv32);
3168 if (iconv == w_iconv32) {
3169 input_endian = ENDIAN_BIG;
3172 (*i_ungetc)(0xFF,f);
3173 }else (*i_ungetc)(c2,f);
3174 (*i_ungetc)(0xFE,f);
3175 }else if(c2 == 0xFF){
3176 if((c2 = (*i_getc)(f)) == 0xFE){
3177 if(!input_encoding){
3178 set_iconv(TRUE, w_iconv32);
3180 if (iconv == w_iconv32) {
3181 input_endian = ENDIAN_2143;
3184 (*i_ungetc)(0xFF,f);
3185 }else (*i_ungetc)(c2,f);
3186 (*i_ungetc)(0xFF,f);
3187 }else (*i_ungetc)(c2,f);
3188 (*i_ungetc)(0x00,f);
3189 }else (*i_ungetc)(c2,f);
3190 (*i_ungetc)(0x00,f);
3193 if((c2 = (*i_getc)(f)) == 0xBB){
3194 if((c2 = (*i_getc)(f)) == 0xBF){
3195 if(!input_encoding){
3196 set_iconv(TRUE, w_iconv);
3198 if (iconv == w_iconv) {
3201 (*i_ungetc)(0xBF,f);
3202 }else (*i_ungetc)(c2,f);
3203 (*i_ungetc)(0xBB,f);
3204 }else (*i_ungetc)(c2,f);
3205 (*i_ungetc)(0xEF,f);
3208 if((c2 = (*i_getc)(f)) == 0xFF){
3209 if((c2 = (*i_getc)(f)) == 0x00){
3210 if((c2 = (*i_getc)(f)) == 0x00){
3211 if(!input_encoding){
3212 set_iconv(TRUE, w_iconv32);
3214 if (iconv == w_iconv32) {
3215 input_endian = ENDIAN_3412;
3218 (*i_ungetc)(0x00,f);
3219 }else (*i_ungetc)(c2,f);
3220 (*i_ungetc)(0x00,f);
3221 }else (*i_ungetc)(c2,f);
3222 if(!input_encoding){
3223 set_iconv(TRUE, w_iconv16);
3225 if (iconv == w_iconv16) {
3226 input_endian = ENDIAN_BIG;
3229 (*i_ungetc)(0xFF,f);
3230 }else (*i_ungetc)(c2,f);
3231 (*i_ungetc)(0xFE,f);
3234 if((c2 = (*i_getc)(f)) == 0xFE){
3235 if((c2 = (*i_getc)(f)) == 0x00){
3236 if((c2 = (*i_getc)(f)) == 0x00){
3237 if(!input_encoding){
3238 set_iconv(TRUE, w_iconv32);
3240 if (iconv == w_iconv32) {
3241 input_endian = ENDIAN_LITTLE;
3244 (*i_ungetc)(0x00,f);
3245 }else (*i_ungetc)(c2,f);
3246 (*i_ungetc)(0x00,f);
3247 }else (*i_ungetc)(c2,f);
3248 if(!input_encoding){
3249 set_iconv(TRUE, w_iconv16);
3251 if (iconv == w_iconv16) {
3252 input_endian = ENDIAN_LITTLE;
3255 (*i_ungetc)(0xFE,f);
3256 }else (*i_ungetc)(c2,f);
3257 (*i_ungetc)(0xFF,f);
3272 init_broken_state(void)
3274 memset(&broken_state, 0, sizeof(broken_state));
3280 broken_state.buf[broken_state.count++] = c;
3284 pop_broken_buf(void)
3286 return broken_state.buf[--broken_state.count];
3290 broken_getc(FILE *f)
3294 if (broken_state.count > 0) {
3295 return pop_broken_buf();
3298 if (c=='$' && broken_state.status != ESC
3299 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3301 broken_state.status = 0;
3302 if (c1=='@'|| c1=='B') {
3303 push_broken_buf(c1);
3310 } else if (c=='(' && broken_state.status != ESC
3311 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3313 broken_state.status = 0;
3314 if (c1=='J'|| c1=='B') {
3315 push_broken_buf(c1);
3323 broken_state.status = c;
3329 broken_ungetc(nkf_char c, FILE *f)
3331 if (broken_state.count < 2)
3337 eol_conv(nkf_char c2, nkf_char c1)
3339 if (guess_f && input_eol != EOF) {
3340 if (c2 == 0 && c1 == LF) {
3341 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3342 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3343 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3345 else if (!input_eol) input_eol = CR;
3346 else if (input_eol != CR) input_eol = EOF;
3348 if (prev_cr || (c2 == 0 && c1 == LF)) {
3350 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3351 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3353 if (c2 == 0 && c1 == CR) prev_cr = CR;
3354 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3358 Return value of fold_conv()
3360 LF add newline and output char
3361 CR add newline and output nothing
3364 1 (or else) normal output
3366 fold state in prev (previous character)
3368 >0x80 Japanese (X0208/X0201)
3373 This fold algorithm does not preserve leading spaces in a line.
3374 This is the main difference from fmt.
3377 #define char_size(c2,c1) ((c2)?2:1) /* 2 when c2 is non-zero, otherwise 1; c1 is not used. Argument parenthesized so expression arguments bind correctly. */
3380 fold_conv(nkf_char c2, nkf_char c1)
3383 nkf_char fold_state;
3385 if (c1== CR && !fold_preserve_f) {
3386 fold_state=0; /* ignore cr */
3387 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3389 fold_state=0; /* ignore cr */
3390 } else if (c1== BS) {
3391 if (f_line>0) f_line--;
3393 } else if (c2==EOF && f_line != 0) { /* close open last line */
3395 } else if ((c1==LF && !fold_preserve_f)
3396 || ((c1==CR||(c1==LF&&f_prev!=CR))
3397 && fold_preserve_f)) {
3399 if (fold_preserve_f) {
3403 } else if ((f_prev == c1 && !fold_preserve_f)
3404 || (f_prev == LF && fold_preserve_f)
3405 ) { /* duplicate newline */
3408 fold_state = LF; /* output two newline */
3414 if (f_prev&0x80) { /* Japanese? */
3416 fold_state = 0; /* ignore given single newline */
3417 } else if (f_prev==SP) {
3421 if (++f_line<=fold_len)
3425 fold_state = CR; /* fold and output nothing */
3429 } else if (c1=='\f') {
3432 fold_state = LF; /* output newline and clear */
3433 } else if ( (c2==0 && c1==SP)||
3434 (c2==0 && c1==TAB)||
3435 (c2=='!'&& c1=='!')) {
3436 /* X0208 kankaku or ascii space */
3438 fold_state = 0; /* remove duplicate spaces */
3441 if (++f_line<=fold_len)
3442 fold_state = SP; /* output ASCII space only */
3444 f_prev = SP; f_line = 0;
3445 fold_state = CR; /* fold and output nothing */
3449 prev0 = f_prev; /* we still need this one... , but almost done */
3451 if (c2 || c2 == JIS_X_0201_1976_K)
3452 f_prev |= 0x80; /* this is Japanese */
3453 f_line += char_size(c2,c1);
3454 if (f_line<=fold_len) { /* normal case */
3457 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3458 f_line = char_size(c2,c1);
3459 fold_state = LF; /* We can't wait, do fold now */
3460 } else if (c2 == JIS_X_0201_1976_K) {
3461 /* simple kinsoku rules return 1 means no folding */
3462 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
3463 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
3464 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
3465 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
3466 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
3467 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3468 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3470 fold_state = LF;/* add one new f_line before this character */
3473 fold_state = LF;/* add one new f_line before this character */
3476 /* kinsoku point in ASCII */
3477 if ( c1==')'|| /* { [ ( */
3488 /* just after special */
3489 } else if (!is_alnum(prev0)) {
3490 f_line = char_size(c2,c1);
3492 } else if ((prev0==SP) || /* ignored new f_line */
3493 (prev0==LF)|| /* ignored new f_line */
3494 (prev0&0x80)) { /* X0208 - ASCII */
3495 f_line = char_size(c2,c1);
3496 fold_state = LF;/* add one new f_line before this character */
3498 fold_state = 1; /* default no fold in ASCII */
3502 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
3503 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
3504 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
3505 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
3506 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
3507 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
3508 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
3509 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
3510 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
3511 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
3512 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
3513 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
3514 /* default no fold in kinsoku */
3517 f_line = char_size(c2,c1);
3518 /* add one new f_line before this character */
3521 f_line = char_size(c2,c1);
3523 /* add one new f_line before this character */
3528 /* terminator process */
3529 switch(fold_state) {
3531 OCONV_NEWLINE((*o_fconv));
3537 OCONV_NEWLINE((*o_fconv));
3548 static nkf_char z_prev2=0,z_prev1=0;
3551 z_conv(nkf_char c2, nkf_char c1)
3554 /* if (c2) c1 &= 0x7f; assertion */
3556 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3562 if (z_prev2 == JIS_X_0201_1976_K) {
3563 if (c2 == JIS_X_0201_1976_K) {
3564 if (c1 == (0xde&0x7f)) { /* dakuten (voiced sound mark) */
3566 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3568 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* handakuten (semi-voiced sound mark) */
3570 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3575 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3577 if (c2 == JIS_X_0201_1976_K) {
3578 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3579 /* wait for a following dakuten (voiced sound mark) or handakuten (semi-voiced sound mark) */
3584 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3595 if (alpha_f&1 && c2 == 0x23) {
3596 /* JISX0208 Alphabet */
3598 } else if (c2 == 0x21) {
3599 /* JISX0208 Kigou */
3604 } else if (alpha_f&4) {
3609 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3615 if (alpha_f&8 && c2 == 0) {
3617 const char *entity = 0;
3619 case '>': entity = ">"; break;
3620 case '<': entity = "<"; break;
3621 case '\"': entity = """; break;
3622 case '&': entity = "&"; break;
3625 while (*entity) (*o_zconv)(0, *entity++);
3631 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3636 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3640 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3644 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3648 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3652 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3656 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3660 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3664 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3669 (*o_zconv)(JIS_X_0201_1976_K, c);
3672 } else if (c2 == 0x25) {
3673 /* JISX0208 Katakana */
3674 static const int fullwidth_to_halfwidth[] =
3676 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3677 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3678 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3679 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3680 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3681 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3682 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3683 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3684 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3685 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3686 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3687 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3689 if (fullwidth_to_halfwidth[c1-0x20]){
3690 c2 = fullwidth_to_halfwidth[c1-0x20];
3691 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3693 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
3703 #define rot13(c) ( \
3705 (c <= 'M') ? (c + 13): \
3706 (c <= 'Z') ? (c - 13): \
3708 (c <= 'm') ? (c + 13): \
3709 (c <= 'z') ? (c - 13): \
3713 #define rot47(c) ( \
3715 ( c <= 'O') ? (c + 47) : \
3716 ( c <= '~') ? (c - 47) : \
3721 rot_conv(nkf_char c2, nkf_char c1)
3723 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3729 (*o_rot_conv)(c2,c1);
3733 hira_conv(nkf_char c2, nkf_char c1)
3737 if (0x20 < c1 && c1 < 0x74) {
3739 (*o_hira_conv)(c2,c1);
3741 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3743 c1 = nkf_char_unicode_new(0x3094);
3744 (*o_hira_conv)(c2,c1);
3747 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3749 (*o_hira_conv)(c2,c1);
3754 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3757 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3759 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3763 (*o_hira_conv)(c2,c1);
3768 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3770 #define RANGE_NUM_MAX 18
3771 static const nkf_char range[RANGE_NUM_MAX][2] = {
3792 nkf_char start, end, c;
3794 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
3798 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
3803 for (i = 0; i < RANGE_NUM_MAX; i++) {
3804 start = range[i][0];
3807 if (c >= start && c <= end) {
3812 (*o_iso2022jp_check_conv)(c2,c1);
3816 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
3818 static const unsigned char *mime_pattern[] = {
3819 (const unsigned char *)"\075?EUC-JP?B?",
3820 (const unsigned char *)"\075?SHIFT_JIS?B?",
3821 (const unsigned char *)"\075?ISO-8859-1?Q?",
3822 (const unsigned char *)"\075?ISO-8859-1?B?",
3823 (const unsigned char *)"\075?ISO-2022-JP?B?",
3824 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3825 #if defined(UTF8_INPUT_ENABLE)
3826 (const unsigned char *)"\075?UTF-8?B?",
3827 (const unsigned char *)"\075?UTF-8?Q?",
3829 (const unsigned char *)"\075?US-ASCII?Q?",
3834 /* Markers used to raise the priority of the corresponding input code */
3835 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3836 e_iconv, s_iconv, 0, 0, 0, 0,
3837 #if defined(UTF8_INPUT_ENABLE)
3843 static const nkf_char mime_encode[] = {
3844 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3845 #if defined(UTF8_INPUT_ENABLE)
3852 static const nkf_char mime_encode_method[] = {
3853 'B', 'B','Q', 'B', 'B', 'Q',
3854 #if defined(UTF8_INPUT_ENABLE)
3862 /* MIME preprocessor fifo */
3864 #define MIME_BUF_SIZE (1024) /* ring buffer capacity in bytes; must remain a power of two (2^n) so that masking wraps correctly */
3865 #define MIME_BUF_MASK (MIME_BUF_SIZE-1) /* AND-mask that wraps any index into 0..MIME_BUF_SIZE-1 */
3866 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK] /* ring-buffer slot for index n; expands to an lvalue, so it can be read or assigned */
3868 unsigned char buf[MIME_BUF_SIZE];
3870 unsigned int last; /* decoded */
3871 unsigned int input; /* undecoded */
3873 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
3875 #define MAXRECOVER 20 /* size of the recovery buffer r[] in mime_begin_strict and upper bound of the non-strict preamble scan loop */
3878 mime_input_buf_unshift(nkf_char c)
3880 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
3884 mime_ungetc(nkf_char c, FILE *f)
3886 mime_input_buf_unshift(c);
3891 mime_ungetc_buf(nkf_char c, FILE *f)
3894 (*i_mungetc_buf)(c,f);
3896 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
3901 mime_getc_buf(FILE *f)
3903 /* we don't keep eof of mime_input_buf, because it contains ?= as
3904 a terminator. It was checked in mime_integrity. */
3905 return ((mimebuf_f)?
3906 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
3910 switch_mime_getc(void)
3912 if (i_getc!=mime_getc) {
3913 i_mgetc = i_getc; i_getc = mime_getc;
3914 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3915 if(mime_f==STRICT_MIME) {
3916 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3917 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
3923 unswitch_mime_getc(void)
3925 if(mime_f==STRICT_MIME) {
3926 i_mgetc = i_mgetc_buf;
3927 i_mungetc = i_mungetc_buf;
3930 i_ungetc = i_mungetc;
3931 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3932 mime_iconv_back = NULL;
3936 mime_integrity(FILE *f, const unsigned char *p)
3940 /* In buffered mode, read until =? or NL or buffer full
3942 mime_input_state.input = mime_input_state.top;
3943 mime_input_state.last = mime_input_state.top;
3945 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3947 q = mime_input_state.input;
3948 while((c=(*i_getc)(f))!=EOF) {
3949 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3950 break; /* buffer full */
3952 if (c=='=' && d=='?') {
3953 /* checked. skip header, start decode */
3954 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3955 /* mime_last_input = mime_input_state.input; */
3956 mime_input_state.input = q;
3960 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3962 /* Should we check length mod 4? */
3963 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3966 /* In case of Incomplete MIME, no MIME decode */
3967 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3968 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3969 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3970 switch_mime_getc(); /* anyway we need buffered getc */
3975 mime_begin_strict(FILE *f)
3979 const unsigned char *p,*q;
3980 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3982 mime_decode_mode = FALSE;
3983 /* =? has been checked */
3985 p = mime_pattern[j];
3988 for(i=2;p[i]>SP;i++) { /* start at =? */
3989 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
3990 /* pattern fails, try next one */
3992 while (mime_pattern[++j]) {
3993 p = mime_pattern[j];
3994 for(k=2;k<i;k++) /* assume length(p) > i */
3995 if (p[k]!=q[k]) break;
3996 if (k==i && nkf_toupper(c1)==p[k]) break;
3998 p = mime_pattern[j];
3999 if (p) continue; /* found next one, continue */
4000 /* all fails, output from recovery buffer */
4008 mime_decode_mode = p[i-2];
4010 mime_iconv_back = iconv;
4011 set_iconv(FALSE, mime_priority_func[j]);
4012 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4014 if (mime_decode_mode=='B') {
4015 mimebuf_f = unbuf_f;
4017 /* do MIME integrity check */
4018 return mime_integrity(f,mime_pattern[j]);
4032 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4033 /* re-read and convert again from mime_buffer. */
4035 /* =? has been checked */
4036 k = mime_input_state.last;
4037 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4038 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4039 /* We accept any character type even if it is broken by new lines */
4040 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4041 if (c1==LF||c1==SP||c1==CR||
4042 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4044 /* Failed. But this could be another MIME preamble */
4046 mime_input_state.last--;
4052 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4053 if (!(++i<MAXRECOVER) || c1==EOF) break;
4054 if (c1=='b'||c1=='B') {
4055 mime_decode_mode = 'B';
4056 } else if (c1=='q'||c1=='Q') {
4057 mime_decode_mode = 'Q';
4061 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4062 if (!(++i<MAXRECOVER) || c1==EOF) break;
4064 mime_decode_mode = FALSE;
4070 if (!mime_decode_mode) {
4071 /* false MIME preamble, restart from mime_buffer */
4072 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4073 /* Since we are in MIME mode until buffer becomes empty, */
4074 /* we never go into mime_begin again for a while. */
4077 /* discard MIME preamble, and go to MIME mode */
4078 mime_input_state.last = k;
4079 /* no MIME integrity check is done here */
4080 return c1; /* used only for checking EOF */
4091 debug(const char *str)
4094 fprintf(stderr, "%s\n", str ? str : "NULL");
4100 set_input_codename(const char *codename)
4102 if (!input_codename) {
4103 input_codename = codename;
4104 } else if (strcmp(codename, input_codename) != 0) {
4105 input_codename = "";
4110 get_guessed_code(void)
4112 if (input_codename && !*input_codename) {
4113 input_codename = "BINARY";
4115 struct input_code *p = find_inputcode_byfunc(iconv);
4116 if (!input_codename) {
4117 input_codename = "ASCII";
4118 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4119 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4120 input_codename = "CP932";
4121 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4122 if (p->score & (SCORE_X0212))
4123 input_codename = "EUCJP-MS";
4124 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4125 input_codename = "CP51932";
4126 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4127 if (p->score & (SCORE_KANA))
4128 input_codename = "CP50221";
4129 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4130 input_codename = "CP50220";
4133 return input_codename;
4136 #if !defined(PERL_XS) && !defined(WIN32DLL)
4138 print_guessed_code(char *filename)
4140 if (filename != NULL) printf("%s: ", filename);
4141 if (input_codename && !*input_codename) {
4144 input_codename = get_guessed_code();
4146 printf("%s\n", input_codename);
4150 input_eol == CR ? " (CR)" :
4151 input_eol == LF ? " (LF)" :
4152 input_eol == CRLF ? " (CRLF)" :
4153 input_eol == EOF ? " (MIXED NL)" :
4163 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4165 nkf_char c1, c2, c3;
4171 if (!nkf_isxdigit(c2)){
4176 if (!nkf_isxdigit(c3)){
4181 return (hex2bin(c2) << 4) | hex2bin(c3);
4187 return hex_getc(':', f, i_cgetc, i_cungetc);
4191 cap_ungetc(nkf_char c, FILE *f)
4193 return (*i_cungetc)(c, f);
4199 return hex_getc('%', f, i_ugetc, i_uungetc);
4203 url_ungetc(nkf_char c, FILE *f)
4205 return (*i_uungetc)(c, f);
4209 #ifdef NUMCHAR_OPTION
4211 numchar_getc(FILE *f)
4213 nkf_char (*g)(FILE *) = i_ngetc;
4214 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4225 if (buf[i] == 'x' || buf[i] == 'X'){
4226 for (j = 0; j < 7; j++){
4228 if (!nkf_isxdigit(buf[i])){
4235 c |= hex2bin(buf[i]);
4238 for (j = 0; j < 8; j++){
4242 if (!nkf_isdigit(buf[i])){
4249 c += hex2bin(buf[i]);
4255 return nkf_char_unicode_new(c);
4265 numchar_ungetc(nkf_char c, FILE *f)
4267 return (*i_nungetc)(c, f);
4271 #ifdef UNICODE_NORMALIZATION
4276 nkf_char (*g)(FILE *f) = i_nfc_getc;
4277 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4278 nkf_buf_t *buf = nkf_buf_new(9);
4279 const unsigned char *array;
4280 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4281 nkf_char c = (*g)(f);
4283 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
4285 nkf_buf_push(buf, (unsigned char)c);
4287 while (lower <= upper) {
4288 int mid = (lower+upper) / 2;
4290 array = normalization_table[mid].nfd;
4291 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
4292 if (len >= nkf_buf_length(buf)) {
4296 lower = 1, upper = 0;
4299 nkf_buf_push(buf, c);
4301 if (array[len] != nkf_buf_at(buf, len)) {
4302 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4303 else upper = mid - 1;
4310 array = normalization_table[mid].nfc;
4312 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4313 nkf_buf_push(buf, array[i]);
4317 } while (lower <= upper);
4319 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4320 c = nkf_buf_pop(buf);
4321 nkf_buf_dispose(buf);
4327 nfc_ungetc(nkf_char c, FILE *f)
4329 return (*i_nfc_ungetc)(c, f);
4331 #endif /* UNICODE_NORMALIZATION */
4335 base64decode(nkf_char c)
4340 i = c - 'A'; /* A..Z 0-25 */
4341 } else if (c == '_') {
4342 i = '?' /* 63 */ ; /* _ 63 */
4344 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4346 } else if (c > '/') {
4347 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4348 } else if (c == '+' || c == '-') {
4349 i = '>' /* 62 */ ; /* + and - 62 */
4351 i = '?' /* 63 */ ; /* / 63 */
4359 nkf_char c1, c2, c3, c4, cc;
4360 nkf_char t1, t2, t3, t4, mode, exit_mode;
4361 nkf_char lwsp_count;
4364 nkf_char lwsp_size = 128;
4366 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4367 return mime_input_buf(mime_input_state.top++);
4369 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4370 mime_decode_mode=FALSE;
4371 unswitch_mime_getc();
4372 return (*i_getc)(f);
4375 if (mimebuf_f == FIXED_MIME)
4376 exit_mode = mime_decode_mode;
4379 if (mime_decode_mode == 'Q') {
4380 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4382 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4383 if (c1<=SP || DEL<=c1) {
4384 mime_decode_mode = exit_mode; /* prepare for quit */
4387 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4391 mime_decode_mode = exit_mode; /* prepare for quit */
4392 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4393 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4394 /* end Q encoding */
4395 input_mode = exit_mode;
4397 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4398 while ((c1=(*i_getc)(f))!=EOF) {
4403 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4411 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4412 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4427 lwsp_buf[lwsp_count] = (unsigned char)c1;
4428 if (lwsp_count++>lwsp_size){
4430 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4431 lwsp_buf = lwsp_buf_new;
4437 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4439 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4440 i_ungetc(lwsp_buf[lwsp_count],f);
4443 nkf_xfree(lwsp_buf);
4446 if (c1=='='&&c2<SP) { /* this is soft wrap */
4447 while((c1 = (*i_mgetc)(f)) <=SP) {
4448 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4450 mime_decode_mode = 'Q'; /* still in MIME */
4451 goto restart_mime_q;
4454 mime_decode_mode = 'Q'; /* still in MIME */
4458 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4459 if (c2<=SP) return c2;
4460 mime_decode_mode = 'Q'; /* still in MIME */
4461 return ((hex2bin(c2)<<4) + hex2bin(c3));
4464 if (mime_decode_mode != 'B') {
4465 mime_decode_mode = FALSE;
4466 return (*i_mgetc)(f);
4470 /* Base64 encoding */
4472 MIME allows line breaks in the middle of
4473 Base64, but we are very pessimistic when decoding
4474 in unbuf mode because MIME-encoded text may be broken by
4475 control sequences from less or an editor (such as ESC-[-K) in unbuffered
4476 mode. Ignore incomplete MIME.
4478 mode = mime_decode_mode;
4479 mime_decode_mode = exit_mode; /* prepare for quit */
4481 while ((c1 = (*i_mgetc)(f))<=SP) {
4486 if ((c2 = (*i_mgetc)(f))<=SP) {
4489 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4490 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4493 if ((c1 == '?') && (c2 == '=')) {
4496 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4497 while ((c1=(*i_getc)(f))!=EOF) {
4502 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4510 if ((c1=(*i_getc)(f))!=EOF) {
4514 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4529 lwsp_buf[lwsp_count] = (unsigned char)c1;
4530 if (lwsp_count++>lwsp_size){
4532 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4533 lwsp_buf = lwsp_buf_new;
4539 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4541 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4542 i_ungetc(lwsp_buf[lwsp_count],f);
4545 nkf_xfree(lwsp_buf);
4549 if ((c3 = (*i_mgetc)(f))<=SP) {
4552 if (mime_f != STRICT_MIME) goto mime_c3_retry;
4553 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4557 if ((c4 = (*i_mgetc)(f))<=SP) {
4560 if (mime_f != STRICT_MIME) goto mime_c4_retry;
4561 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4565 mime_decode_mode = mode; /* still in MIME sigh... */
4567 /* BASE 64 decoding */
4569 t1 = 0x3f & base64decode(c1);
4570 t2 = 0x3f & base64decode(c2);
4571 t3 = 0x3f & base64decode(c3);
4572 t4 = 0x3f & base64decode(c4);
4573 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
4575 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4576 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
4578 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4579 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
4581 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
4586 return mime_input_buf(mime_input_state.top++);
4589 static const char basis_64[] = /* Base64 output alphabet: the index is a 6-bit value (0..63), the element is its encoded character */
4590 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
4592 #define MIMEOUT_BUF_LENGTH 74 /* flush threshold for the MIME output buffer; the buf[] declared below is one byte larger (MIMEOUT_BUF_LENGTH+1) */
4594 char buf[MIMEOUT_BUF_LENGTH+1];
4599 /*nkf_char mime_lastchar2, mime_lastchar1;*/
4602 open_mime(nkf_char mode)
4604 const unsigned char *p;
4607 p = mime_pattern[0];
4608 for(i=0;mime_pattern[i];i++) {
4609 if (mode == mime_encode[i]) {
4610 p = mime_pattern[i];
4614 mimeout_mode = mime_encode_method[i];
4616 if (base64_count>45) {
4617 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
4618 (*o_mputc)(mimeout_state.buf[i]);
4621 PUT_NEWLINE((*o_mputc));
4624 if (mimeout_state.count>0
4625 && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4626 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) {
4630 for (;i<mimeout_state.count;i++) {
4631 if (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
4632 || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) {
4633 (*o_mputc)(mimeout_state.buf[i]);
4643 j = mimeout_state.count;
4644 mimeout_state.count = 0;
4646 mime_putc(mimeout_state.buf[i]);
4651 mime_prechar(nkf_char c2, nkf_char c1)
4653 if (mimeout_mode > 0){
4655 if (base64_count + mimeout_state.count/3*4> 73){
4656 (*o_base64conv)(EOF,0);
4657 OCONV_NEWLINE((*o_base64conv));
4658 (*o_base64conv)(0,SP);
4662 if (base64_count + mimeout_state.count/3*4> 66) {
4663 (*o_base64conv)(EOF,0);
4664 OCONV_NEWLINE((*o_base64conv));
4665 (*o_base64conv)(0,SP);
4671 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
4672 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
4673 open_mime(output_mode);
4674 (*o_base64conv)(EOF,0);
4675 OCONV_NEWLINE((*o_base64conv));
4676 (*o_base64conv)(0,SP);
4695 switch(mimeout_mode) {
4700 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]);
4706 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]);
4711 if (mimeout_mode > 0) {
4712 if (mimeout_f!=FIXED_MIME) {
4714 } else if (mimeout_mode != 'Q')
4720 mimeout_addchar(nkf_char c)
4722 switch(mimeout_mode) {
4727 } else if(!nkf_isalnum(c)) {
4729 (*o_mputc)(bin2hex(((c>>4)&0xf)));
4730 (*o_mputc)(bin2hex((c&0xf)));
4738 mimeout_state.state=c;
4739 (*o_mputc)(basis_64[c>>2]);
4744 (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
4745 mimeout_state.state=c;
4750 (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]);
4751 (*o_mputc)(basis_64[c & 0x3F]);
4763 mime_putc(nkf_char c)
4768 if (mimeout_f == FIXED_MIME){
4769 if (mimeout_mode == 'Q'){
4770 if (base64_count > 71){
4771 if (c!=CR && c!=LF) {
4773 PUT_NEWLINE((*o_mputc));
4778 if (base64_count > 71){
4780 PUT_NEWLINE((*o_mputc));
4783 if (c == EOF) { /* c==EOF */
4787 if (c != EOF) { /* c!=EOF */
4793 /* mimeout_f != FIXED_MIME */
4795 if (c == EOF) { /* c==EOF */
4796 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
4797 j = mimeout_state.count;
4798 mimeout_state.count = 0;
4800 if (mimeout_mode > 0) {
4801 if (!nkf_isblank(mimeout_state.buf[j-1])) {
4803 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
4806 mimeout_addchar(mimeout_state.buf[i]);
4810 mimeout_addchar(mimeout_state.buf[i]);
4814 mimeout_addchar(mimeout_state.buf[i]);
4820 mimeout_addchar(mimeout_state.buf[i]);
4826 if (mimeout_state.count > 0){
4827 lastchar = mimeout_state.buf[mimeout_state.count - 1];
4832 if (mimeout_mode=='Q') {
4833 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4834 if (c == CR || c == LF) {
4839 } else if (c <= SP) {
4841 if (base64_count > 70) {
4842 PUT_NEWLINE((*o_mputc));
4845 if (!nkf_isblank(c)) {
4850 if (base64_count > 70) {
4852 PUT_NEWLINE((*o_mputc));
4855 open_mime(output_mode);
4857 if (!nkf_noescape_mime(c)) {
4868 if (mimeout_mode <= 0) {
4869 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4870 if (nkf_isspace(c)) {
4872 if (mimeout_mode == -1) {
4875 if (c==CR || c==LF) {
4877 open_mime(output_mode);
4883 for (i=0;i<mimeout_state.count;i++) {
4884 (*o_mputc)(mimeout_state.buf[i]);
4885 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
4896 mimeout_state.buf[0] = (char)c;
4897 mimeout_state.count = 1;
4899 if (base64_count > 1
4900 && base64_count + mimeout_state.count > 76
4901 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
4902 static const char *str = "boundary=\"";
4903 static int len = 10;
4906 for (; i < mimeout_state.count - len; ++i) {
4907 if (!strncmp(mimeout_state.buf+i, str, len)) {
4913 if (i == 0 || i == mimeout_state.count - len) {
4914 PUT_NEWLINE((*o_mputc));
4916 if (!nkf_isspace(mimeout_state.buf[0])){
4923 for (j = 0; j <= i; ++j) {
4924 (*o_mputc)(mimeout_state.buf[j]);
4926 PUT_NEWLINE((*o_mputc));
4928 for (; j <= mimeout_state.count; ++j) {
4929 mimeout_state.buf[j - i] = mimeout_state.buf[j];
4931 mimeout_state.count -= i;
4934 mimeout_state.buf[mimeout_state.count++] = (char)c;
4935 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4936 open_mime(output_mode);
4941 if (lastchar==CR || lastchar == LF){
4942 for (i=0;i<mimeout_state.count;i++) {
4943 (*o_mputc)(mimeout_state.buf[i]);
4946 mimeout_state.count = 0;
4949 for (i=0;i<mimeout_state.count-1;i++) {
4950 (*o_mputc)(mimeout_state.buf[i]);
4953 mimeout_state.buf[0] = SP;
4954 mimeout_state.count = 1;
4956 open_mime(output_mode);
4959 /* mimeout_mode == 'B', 1, 2 */
4960 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
4961 if (lastchar == CR || lastchar == LF){
4962 if (nkf_isblank(c)) {
4963 for (i=0;i<mimeout_state.count;i++) {
4964 mimeout_addchar(mimeout_state.buf[i]);
4966 mimeout_state.count = 0;
4967 } else if (SP<c && c<DEL) {
4969 for (i=0;i<mimeout_state.count;i++) {
4970 (*o_mputc)(mimeout_state.buf[i]);
4973 mimeout_state.count = 0;
4975 mimeout_state.buf[mimeout_state.count++] = (char)c;
4978 if (c==SP || c==TAB || c==CR || c==LF) {
4979 for (i=0;i<mimeout_state.count;i++) {
4980 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
4982 for (i=0;i<mimeout_state.count;i++) {
4983 (*o_mputc)(mimeout_state.buf[i]);
4986 mimeout_state.count = 0;
4989 mimeout_state.buf[mimeout_state.count++] = (char)c;
4990 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
4992 for (i=0;i<mimeout_state.count;i++) {
4993 (*o_mputc)(mimeout_state.buf[i]);
4996 mimeout_state.count = 0;
5000 if (mimeout_state.count>0 && SP<c && c!='=') {
5001 mimeout_state.buf[mimeout_state.count++] = (char)c;
5002 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5003 j = mimeout_state.count;
5004 mimeout_state.count = 0;
5006 mimeout_addchar(mimeout_state.buf[i]);
5013 if (mimeout_state.count>0) {
5014 j = mimeout_state.count;
5015 mimeout_state.count = 0;
5017 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5019 mimeout_addchar(mimeout_state.buf[i]);
5025 (*o_mputc)(mimeout_state.buf[i]);
5027 open_mime(output_mode);
5034 base64_conv(nkf_char c2, nkf_char c1)
5036 mime_prechar(c2, c1);
5037 (*o_base64conv)(c2,c1);
5041 typedef struct nkf_iconv_t {
5044 size_t input_buffer_size;
5045 char *output_buffer;
5046 size_t output_buffer_size;
5050 nkf_iconv_new(char *tocode, char *fromcode)
5052 nkf_iconv_t converter;
5054 converter->input_buffer_size = IOBUF_SIZE;
5055 converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
5056 converter->output_buffer_size = IOBUF_SIZE * 2;
5057 converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
5058 converter->cd = iconv_open(tocode, fromcode);
5059 if (converter->cd == (iconv_t)-1)
5063 perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode));
5066 perror("can't iconv_open");
5072 nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
5074 size_t invalid = (size_t)0;
5075 char *input_buffer = converter->input_buffer;
5076 size_t input_length = (size_t)0;
5077 char *output_buffer = converter->output_buffer;
5078 size_t output_length = converter->output_buffer_size;
5083 while ((c = (*i_getc)(f)) != EOF) {
5084 input_buffer[input_length++] = c;
5085 if (input_length < converter->input_buffer_size) break;
5089 size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length);
5090 while (output_length-- > 0) {
5091 (*o_putc)(output_buffer[converter->output_buffer_size-output_length]);
5093 if (ret == (size_t) - 1) {
5096 if (input_buffer != converter->input_buffer)
5097 memmove(converter->input_buffer, input_buffer, input_length);
5100 converter->output_buffer_size *= 2;
5101 output_buffer = realloc(converter->outbuf, converter->output_buffer_size);
5102 if (output_buffer == NULL) {
5103 perror("can't realloc");
5106 converter->output_buffer = output_buffer;
5109 perror("can't iconv");
5122 nkf_iconv_close(nkf_iconv_t *convert)
5124 nkf_xfree(converter->inbuf);
5125 nkf_xfree(converter->outbuf);
5126 iconv_close(converter->cd);
5135 struct input_code *p = input_code_list;
5147 mime_f = MIME_DECODE_DEFAULT;
5148 mime_decode_f = FALSE;
5153 x0201_f = X0201_DEFAULT;
5154 iso2022jp_f = FALSE;
5155 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5156 ms_ucs_map_f = UCS_MAP_ASCII;
5158 #ifdef UTF8_INPUT_ENABLE
5159 no_cp932ext_f = FALSE;
5160 no_best_fit_chars_f = FALSE;
5161 encode_fallback = NULL;
5162 unicode_subchar = '?';
5163 input_endian = ENDIAN_BIG;
5165 #ifdef UTF8_OUTPUT_ENABLE
5166 output_bom_f = FALSE;
5167 output_endian = ENDIAN_BIG;
5169 #ifdef UNICODE_NORMALIZATION
5185 #ifdef SHIFTJIS_CP932
5195 for (i = 0; i < 256; i++){
5196 prefix_table[i] = 0;
5200 mimeout_state.count = 0;
5205 fold_preserve_f = FALSE;
5208 kanji_intro = DEFAULT_J;
5209 ascii_intro = DEFAULT_R;
5210 fold_margin = FOLD_MARGIN;
5211 o_zconv = no_connection;
5212 o_fconv = no_connection;
5213 o_eol_conv = no_connection;
5214 o_rot_conv = no_connection;
5215 o_hira_conv = no_connection;
5216 o_base64conv = no_connection;
5217 o_iso2022jp_check_conv = no_connection;
5220 i_ungetc = std_ungetc;
5222 i_bungetc = std_ungetc;
5225 i_mungetc = std_ungetc;
5226 i_mgetc_buf = std_getc;
5227 i_mungetc_buf = std_ungetc;
5228 output_mode = ASCII;
5230 mime_decode_mode = FALSE;
5236 init_broken_state();
5237 z_prev2=0,z_prev1=0;
5239 iconv_for_check = 0;
5241 input_codename = NULL;
5242 input_encoding = NULL;
5243 output_encoding = NULL;
5250 module_connection(void)
5252 if (input_encoding) set_input_encoding(input_encoding);
5253 if (!output_encoding) {
5254 output_encoding = nkf_default_encoding();
5256 if (!output_encoding) {
5257 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5260 set_output_encoding(output_encoding);
5261 oconv = nkf_enc_to_oconv(output_encoding);
5264 /* replace continuation module, from output side */
5266 /* output redirection */
5268 if (noout_f || guess_f){
5275 if (mimeout_f == TRUE) {
5276 o_base64conv = oconv; oconv = base64_conv;
5278 /* base64_count = 0; */
5281 if (eolmode_f || guess_f) {
5282 o_eol_conv = oconv; oconv = eol_conv;
5285 o_rot_conv = oconv; oconv = rot_conv;
5288 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5291 o_hira_conv = oconv; oconv = hira_conv;
5294 o_fconv = oconv; oconv = fold_conv;
5297 if (alpha_f || x0201_f) {
5298 o_zconv = oconv; oconv = z_conv;
5302 i_ungetc = std_ungetc;
5303 /* input redirection */
5306 i_cgetc = i_getc; i_getc = cap_getc;
5307 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5310 i_ugetc = i_getc; i_getc = url_getc;
5311 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5314 #ifdef NUMCHAR_OPTION
5316 i_ngetc = i_getc; i_getc = numchar_getc;
5317 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5320 #ifdef UNICODE_NORMALIZATION
5322 i_nfc_getc = i_getc; i_getc = nfc_getc;
5323 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5326 if (mime_f && mimebuf_f==FIXED_MIME) {
5327 i_mgetc = i_getc; i_getc = mime_getc;
5328 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5331 i_bgetc = i_getc; i_getc = broken_getc;
5332 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5334 if (input_encoding) {
5335 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5337 set_iconv(FALSE, e_iconv);
5341 struct input_code *p = input_code_list;
5350 Conversion main loop. Code detection only.
5353 #if !defined(PERL_XS) && !defined(WIN32DLL)
5360 module_connection();
5361 while ((c = (*i_getc)(f)) != EOF)
/* Loop-control shorthands for the conversion loops (e.g. LAST in kanji_convert()). */
5368 #define NEXT continue /* no output, get next */
5369 #define SKIP c2=0;continue /* clear c2, no output, get next */
5370 #define MORE c2=c1;continue /* keep c1 in c2, need one more byte */
5371 #define SEND ; /* output c1 and c2, get next */
5372 #define LAST break /* end of loop, go closing */
/*
 * set_input_mode(mode): store mode in input_mode and record/debug-print the
 * input codename "ISO-2022-JP".
 * NOTE(review): one line of the macro body is missing from this listing.
 */
5373 #define set_input_mode(mode) do { \
5374 input_mode = mode; \
5376 set_input_codename("ISO-2022-JP"); \
5377 debug("ISO-2022-JP"); \
/*
 * kanji_convert: main conversion loop for one input stream.
 *
 * f is read exclusively through (*i_getc)/(*i_ungetc).  After
 * module_connection() has set up the pipeline, the input is handled by one
 * of three loops:
 *   - iconv == w_iconv32: four bytes at a time via nkf_iconv_utf_32();
 *   - iconv == w_iconv16: two bytes at a time via nkf_iconv_utf_16();
 *   - otherwise: byte by byte, tracking input_mode, ESC/SI/SO sequences,
 *     MIME starts ("=?") and line ends, and handing characters to
 *     (*iconv)/(*oconv).
 * At the end (*iconv)(EOF, 0, 0) is called, and when no input codename has
 * been set, the input_code_list entry with the lowest score is reported.
 *
 * NOTE(review): this listing omits lines; several branches, braces and the
 * return statements are not visible here.
 */
5381 kanji_convert(FILE *f)
5383 nkf_char c1=0, c2=0, c3=0, c4=0;
5384 int shift_mode = 0; /* 0, 1, 2, 3 */
5386 int is_8bit = FALSE;
5388 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5393 output_mode = ASCII;
5395 if (module_connection() < 0) {
5396 #if !defined(PERL_XS) && !defined(WIN32DLL)
5397 fprintf(stderr, "no output encoding given\n");
5403 #ifdef UTF8_INPUT_ENABLE
/* UTF-32 input: consume four bytes per call */
5404 if(iconv == w_iconv32){
5405 while ((c1 = (*i_getc)(f)) != EOF &&
5406 (c2 = (*i_getc)(f)) != EOF &&
5407 (c3 = (*i_getc)(f)) != EOF &&
5408 (c4 = (*i_getc)(f)) != EOF) {
5409 nkf_iconv_utf_32(c1, c2, c3, c4);
5411 (*i_ungetc)(EOF, f);
/*
 * UTF-16 input: consume two bytes per call; when nkf_iconv_utf_16() returns
 * -2, two more bytes are read and the call is repeated with all four.
 */
5413 else if (iconv == w_iconv16) {
5414 while ((c1 = (*i_getc)(f)) != EOF &&
5415 (c2 = (*i_getc)(f)) != EOF) {
5416 if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5417 (c3 = (*i_getc)(f)) != EOF &&
5418 (c4 = (*i_getc)(f)) != EOF) {
5419 nkf_iconv_utf_16(c1, c2, c3, c4);
5422 (*i_ungetc)(EOF, f);
/* all other inputs: byte-at-a-time loop */
5426 while ((c1 = (*i_getc)(f)) != EOF) {
5427 #ifdef INPUT_CODE_FIX
5428 if (!input_encoding)
5434 /* in case of 8th bit is on */
5435 if (!estab_f&&!mime_decode_mode) {
5436 /* in case of not established yet */
5437 /* It is still ambiguous */
5438 if (h_conv(f, c2, c1)==EOF) {
5446 /* in case of already established */
5448 /* ignore bogus code */
5456 /* 2nd byte of 7 bit code or SJIS */
5460 else if (nkf_char_unicode_p(c1)) {
5466 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5469 } else if (c1 > DEL) {
5471 if (!estab_f && !iso8859_f) {
5472 /* not established yet */
5474 } else { /* estab_f==TRUE */
5480 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5481 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5483 c2 = JIS_X_0201_1976_K;
5488 /* already established */
5492 } else if (SP < c1 && c1 < DEL) {
5493 /* in case of Roman characters */
5495 /* output 1 shifted byte */
5499 } else if (nkf_byte_jisx0201_katakana_p(c1)){
5500 /* output 1 shifted byte */
5501 c2 = JIS_X_0201_1976_K;
5504 /* look like bogus code */
5507 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
5508 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
5509 /* in case of Kanji shifted */
5511 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
5512 /* Check MIME code */
5513 if ((c1 = (*i_getc)(f)) == EOF) {
5516 } else if (c1 == '?') {
5517 /* =? is mime conversion start sequence */
5518 if(mime_f == STRICT_MIME) {
5519 /* check in real detail */
5520 if (mime_begin_strict(f) == EOF)
5523 } else if (mime_begin(f) == EOF)
5532 /* normal ASCII code */
5535 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
5538 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
5541 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
5542 if ((c1 = (*i_getc)(f)) == EOF) {
5543 /* (*oconv)(0, ESC); don't send bogus code */
5546 else if (c1 == '&') {
5548 if ((c1 = (*i_getc)(f)) == EOF) {
5554 else if (c1 == '$') {
5556 if ((c1 = (*i_getc)(f)) == EOF) {
5557 /* don't send bogus code
5559 (*oconv)(0, '$'); */
5561 } else if (c1 == '@' || c1 == 'B') {
5563 set_input_mode(JIS_X_0208);
5565 } else if (c1 == '(') {
5567 if ((c1 = (*i_getc)(f)) == EOF) {
5568 /* don't send bogus code
5574 } else if (c1 == '@'|| c1 == 'B') {
5576 set_input_mode(JIS_X_0208);
5579 } else if (c1 == 'D'){
5580 set_input_mode(JIS_X_0212);
5582 #endif /* X0212_ENABLE */
5583 } else if (c1 == 'O' || c1 == 'Q'){
5584 set_input_mode(JIS_X_0213_1);
5586 } else if (c1 == 'P'){
5587 set_input_mode(JIS_X_0213_2);
5590 /* could be some special code */
5597 } else if (broken_f&0x2) {
5598 /* accept any ESC-(-x as broken code ... */
5599 input_mode = JIS_X_0208;
5608 } else if (c1 == '(') {
5610 if ((c1 = (*i_getc)(f)) == EOF) {
5611 /* don't send bogus code
5613 (*oconv)(0, '('); */
5616 else if (c1 == 'I') {
5617 /* JIS X 0201 Katakana */
5618 set_input_mode(JIS_X_0201_1976_K);
5621 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
5622 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
5623 set_input_mode(ASCII);
5626 else if (broken_f&0x2) {
5627 set_input_mode(ASCII);
5636 else if (c1 == '.') {
5638 if ((c1 = (*i_getc)(f)) == EOF) {
5641 else if (c1 == 'A') {
5652 else if (c1 == 'N') {
5655 if (g2 == ISO_8859_1) {
5670 } else if (c1 == ESC && iconv == s_iconv) {
5671 /* ESC in Shift_JIS */
5672 if ((c1 = (*i_getc)(f)) == EOF) {
5673 /* (*oconv)(0, ESC); don't send bogus code */
5675 } else if (c1 == '$') {
5677 if ((c1 = (*i_getc)(f)) == EOF) {
5679 } else if (('E' <= c1 && c1 <= 'G') ||
5680 ('O' <= c1 && c1 <= 'Q')) {
/*
 * 7-entry table indexed with c1 % 7; the selected value becomes the base c3
 * that is added to each following byte in the range SP..'z'.
 */
5688 static const nkf_char jphone_emoji_first_table[7] =
5689 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
5690 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
5691 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5692 while (SP <= c1 && c1 <= 'z') {
5693 (*oconv)(0, c1 + c3);
5694 if ((c1 = (*i_getc)(f)) == EOF) LAST;
5709 } else if (c1 == LF || c1 == CR) {
5711 input_mode = ASCII; set_iconv(FALSE, 0);
5713 } else if (mime_decode_f && !mime_decode_mode){
5715 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
5723 } else { /* if (c1 == CR)*/
5724 if ((c1=(*i_getc)(f))!=EOF) {
5728 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
5748 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
5751 if ((c3 = (*i_getc)(f)) != EOF) {
5754 if ((c4 = (*i_getc)(f)) != EOF) {
5756 (*iconv)(c2, c1, c3|c4);
5761 /* 3 bytes EUC or UTF-8 */
5762 if ((c3 = (*i_getc)(f)) != EOF) {
5764 (*iconv)(c2, c1, c3);
5772 0x7F <= c2 && c2 <= 0x92 &&
5773 0x21 <= c1 && c1 <= 0x7E) {
5775 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
5778 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
5782 (*oconv)(PREFIX_EUCG3 | c2, c1);
5784 #endif /* X0212_ENABLE */
5786 (*oconv)(PREFIX_EUCG3 | c2, c1);
5789 (*oconv)(input_mode, c1); /* other special case */
5795 /* goto next_word */
5799 (*iconv)(EOF, 0, 0);
/* no codename set so far: report the input_code_list entry with the lowest score */
5800 if (!input_codename)
5803 struct input_code *p = input_code_list;
5804 struct input_code *result = p;
5806 if (p->score < result->score) result = p;
5809 set_input_codename(result->name);
5811 debug(result->name);
5819 * int options(unsigned char *cp)
/*
 * options: parse one option string and update the global option state.
 *
 * cp points at a NUL-terminated option string.  Everything up to and
 * including the first '-' is skipped, then option characters are processed
 * one by one:
 *   - "--name" long options are looked up in long_option[]; an entry with a
 *     non-empty alias redirects cp to the alias string, entries without an
 *     alias are handled by the strcmp() chain below;
 *   - single-letter options set the corresponding flags or the
 *     input_encoding / output_encoding globals.
 * Unknown options are reported on stderr unless built as PERL_XS or
 * WIN32DLL.
 *
 * NOTE(review): this listing omits lines; `continue`/`break` statements and
 * several branches are not visible here.
 */
5826 options(unsigned char *cp)
5830 unsigned char *cp_back = NULL;
5835 while(*cp && *cp++!='-');
5836 while (*cp || cp_back) {
5844 case '-': /* literal options */
5845 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
/* find the long_option[] entry whose name matches the text at cp */
5849 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5850 p = (unsigned char *)long_option[i].name;
5851 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5852 if (*p == cp[j] || cp[j] == SP){
5859 #if !defined(PERL_XS) && !defined(WIN32DLL)
5860 fprintf(stderr, "unknown long option: --%s\n", cp);
5864 while(*cp && *cp != SP && cp++);
5865 if (long_option[i].alias[0]){
5867 cp = (unsigned char *)long_option[i].alias;
5869 if (strcmp(long_option[i].name, "ic=") == 0){
5870 enc = nkf_enc_find((char *)p);
5872 input_encoding = enc;
5875 if (strcmp(long_option[i].name, "oc=") == 0){
5876 enc = nkf_enc_find((char *)p);
5877 /* if (enc <= 0) continue; */
5879 output_encoding = enc;
5882 if (strcmp(long_option[i].name, "guess=") == 0){
5883 if (p[0] == '0' || p[0] == '1') {
5891 if (strcmp(long_option[i].name, "overwrite") == 0){
5894 preserve_time_f = TRUE;
5897 if (strcmp(long_option[i].name, "overwrite=") == 0){
5900 preserve_time_f = TRUE;
5902 backup_suffix = (char *)p;
5905 if (strcmp(long_option[i].name, "in-place") == 0){
5908 preserve_time_f = FALSE;
5911 if (strcmp(long_option[i].name, "in-place=") == 0){
5914 preserve_time_f = FALSE;
5916 backup_suffix = (char *)p;
5921 if (strcmp(long_option[i].name, "cap-input") == 0){
5925 if (strcmp(long_option[i].name, "url-input") == 0){
5930 #ifdef NUMCHAR_OPTION
5931 if (strcmp(long_option[i].name, "numchar-input") == 0){
5937 if (strcmp(long_option[i].name, "no-output") == 0){
5941 if (strcmp(long_option[i].name, "debug") == 0){
5946 if (strcmp(long_option[i].name, "cp932") == 0){
5947 #ifdef SHIFTJIS_CP932
5951 #ifdef UTF8_OUTPUT_ENABLE
5952 ms_ucs_map_f = UCS_MAP_CP932;
5956 if (strcmp(long_option[i].name, "no-cp932") == 0){
5957 #ifdef SHIFTJIS_CP932
5961 #ifdef UTF8_OUTPUT_ENABLE
5962 ms_ucs_map_f = UCS_MAP_ASCII;
5966 #ifdef SHIFTJIS_CP932
5967 if (strcmp(long_option[i].name, "cp932inv") == 0){
5974 if (strcmp(long_option[i].name, "x0212") == 0){
5981 if (strcmp(long_option[i].name, "exec-in") == 0){
5985 if (strcmp(long_option[i].name, "exec-out") == 0){
5990 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
5991 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
5992 no_cp932ext_f = TRUE;
5995 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
5996 no_best_fit_chars_f = TRUE;
5999 if (strcmp(long_option[i].name, "fb-skip") == 0){
6000 encode_fallback = NULL;
6003 if (strcmp(long_option[i].name, "fb-html") == 0){
6004 encode_fallback = encode_fallback_html;
6007 if (strcmp(long_option[i].name, "fb-xml") == 0){
6008 encode_fallback = encode_fallback_xml;
6011 if (strcmp(long_option[i].name, "fb-java") == 0){
6012 encode_fallback = encode_fallback_java;
6015 if (strcmp(long_option[i].name, "fb-perl") == 0){
6016 encode_fallback = encode_fallback_perl;
6019 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6020 encode_fallback = encode_fallback_subchar;
6023 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6024 encode_fallback = encode_fallback_subchar;
6025 unicode_subchar = 0;
6027 /* decimal number */
6028 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6029 unicode_subchar *= 10;
6030 unicode_subchar += hex2bin(p[i]);
6032 }else if(p[1] == 'x' || p[1] == 'X'){
6033 /* hexadecimal number */
6034 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6035 unicode_subchar <<= 4;
6036 unicode_subchar |= hex2bin(p[i]);
6040 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6041 unicode_subchar *= 8;
6042 unicode_subchar += hex2bin(p[i]);
6045 w16e_conv(unicode_subchar, &i, &j);
6046 unicode_subchar = i<<8 | j;
6050 #ifdef UTF8_OUTPUT_ENABLE
6051 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6052 ms_ucs_map_f = UCS_MAP_MS;
6056 #ifdef UNICODE_NORMALIZATION
6057 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
6062 if (strcmp(long_option[i].name, "prefix=") == 0){
6063 if (nkf_isgraph(p[0])){
6064 for (i = 1; nkf_isgraph(p[i]); i++){
6065 prefix_table[p[i]] = p[0];
6070 #if !defined(PERL_XS) && !defined(WIN32DLL)
6071 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6076 case 'b': /* buffered mode */
6079 case 'u': /* non-buffered mode */
6082 case 't': /* transparent mode */
6087 } else if (*cp=='2') {
6091 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6099 case 'j': /* JIS output */
6101 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6103 case 'e': /* AT&T EUC output */
6104 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6106 case 's': /* SJIS output */
6107 output_encoding = nkf_enc_from_index(WINDOWS_31J);
6109 case 'l': /* ISO8859 Latin-1 support, no conversion */
6110 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6111 input_encoding = nkf_enc_from_index(ISO_8859_1);
6113 case 'i': /* Kanji IN ESC-$-@/B */
6114 if (*cp=='@'||*cp=='B')
6115 kanji_intro = *cp++;
6117 case 'o': /* ASCII IN ESC-(-J/B */
6118 if (*cp=='J'||*cp=='B'||*cp=='H')
6119 ascii_intro = *cp++;
6123 bit:1 katakana->hiragana
6124 bit:2 hiragana->katakana
6126 if ('9'>= *cp && *cp>='0')
6127 hira_f |= (*cp++ -'0');
6134 #if defined(MSDOS) || defined(__OS2__)
6141 show_configuration();
6149 #ifdef UTF8_OUTPUT_ENABLE
6150 case 'w': /* UTF-8 output */
6155 output_encoding = nkf_enc_from_index(UTF_8N);
6157 output_bom_f = TRUE;
6158 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6162 if ('1'== cp[0] && '6'==cp[1]) {
6165 } else if ('3'== cp[0] && '2'==cp[1]) {
6169 output_encoding = nkf_enc_from_index(UTF_8);
6174 output_endian = ENDIAN_LITTLE;
6175 } else if (cp[0] == 'B') {
6178 output_encoding = nkf_enc_from_index(enc_idx);
6183 enc_idx = enc_idx == UTF_16
6184 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6185 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6187 output_bom_f = TRUE;
6188 enc_idx = enc_idx == UTF_16
6189 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6190 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6192 output_encoding = nkf_enc_from_index(enc_idx);
6196 #ifdef UTF8_INPUT_ENABLE
6197 case 'W': /* UTF input */
6200 input_encoding = nkf_enc_from_index(UTF_8);
6203 if ('1'== cp[0] && '6'==cp[1]) {
6205 input_endian = ENDIAN_BIG;
6207 } else if ('3'== cp[0] && '2'==cp[1]) {
6209 input_endian = ENDIAN_BIG;
6212 input_encoding = nkf_enc_from_index(UTF_8);
6217 input_endian = ENDIAN_LITTLE;
6218 } else if (cp[0] == 'B') {
6220 input_endian = ENDIAN_BIG;
6222 enc_idx = (enc_idx == UTF_16
6223 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6224 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6225 input_encoding = nkf_enc_from_index(enc_idx);
6229 /* Input code assumption */
6230 case 'J': /* ISO-2022-JP input */
6231 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6233 case 'E': /* EUC-JP input */
6234 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6236 case 'S': /* Windows-31J input */
6237 input_encoding = nkf_enc_from_index(WINDOWS_31J);
6239 case 'Z': /* Convert X0208 alphabet to ASCII */
6241 bit:0 Convert JIS X 0208 Alphabet to ASCII
6242 bit:1 Convert Kankaku to one space
6243 bit:2 Convert Kankaku to two spaces
6244 bit:3 Convert HTML Entity
6245 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6247 while ('0'<= *cp && *cp <='9') {
6248 alpha_f |= 1 << (*cp++ - '0');
6250 if (!alpha_f) alpha_f = 1;
6252 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6253 x0201_f = FALSE; /* No X0201->X0208 conversion */
6255 ESC-(-I in JIS, EUC, MS Kanji
6256 SI/SO in JIS, EUC, MS Kanji
6257 SS2 in EUC, JIS, not in MS Kanji
6258 MS Kanji (0xa0-0xdf)
6260 ESC-(-I in JIS (0x20-0x5f)
6261 SS2 in EUC (0xa0-0xdf)
6262 0xa0-0xd in MS Kanji (0xa0-0xdf)
6265 case 'X': /* Convert X0201 kana to X0208 */
6268 case 'F': /* preserve new lines */
6269 fold_preserve_f = TRUE; /* fall through to 'f' */
6270 case 'f': /* folding -f60 or -f */
6273 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6275 fold_len += *cp++ - '0';
6277 if (!(0<fold_len && fold_len<BUFSIZ))
6278 fold_len = DEFAULT_FOLD;
6282 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6284 fold_margin += *cp++ - '0';
6288 case 'm': /* MIME support */
6289 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6290 if (*cp=='B'||*cp=='Q') {
6291 mime_decode_mode = *cp++;
6292 mimebuf_f = FIXED_MIME;
6293 } else if (*cp=='N') {
6294 mime_f = TRUE; cp++;
6295 } else if (*cp=='S') {
6296 mime_f = STRICT_MIME; cp++;
6297 } else if (*cp=='0') {
6298 mime_decode_f = FALSE;
6299 mime_f = FALSE; cp++;
6301 mime_f = STRICT_MIME;
6304 case 'M': /* MIME output */
6307 mimeout_f = FIXED_MIME; cp++;
6308 } else if (*cp=='Q') {
6310 mimeout_f = FIXED_MIME; cp++;
6315 case 'B': /* Broken JIS support */
6317 bit:1 allow any x on ESC-(-x or ESC-$-x
6318 bit:2 reset to ascii on NL
6320 if ('9'>= *cp && *cp>='0')
6321 broken_f |= 1<<(*cp++ -'0');
6326 case 'O':/* for Output file */
6330 case 'c':/* add cr code */
6333 case 'd':/* delete cr code */
6336 case 'I': /* ISO-2022-JP output */
6339 case 'L': /* line mode */
6340 if (*cp=='u') { /* unix */
6341 eolmode_f = LF; cp++;
6342 } else if (*cp=='m') { /* mac */
6343 eolmode_f = CR; cp++;
6344 } else if (*cp=='w') { /* windows */
6345 eolmode_f = CRLF; cp++;
6346 } else if (*cp=='0') { /* no conversion */
6347 eolmode_f = 0; cp++;
6352 if ('2' <= *cp && *cp <= '9') {
6355 } else if (*cp == '0' || *cp == '1') {
6364 /* multiple options in a string are allowed for the Perl module */
6365 while(*cp && *cp++!='-');
6368 #if !defined(PERL_XS) && !defined(WIN32DLL)
6369 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6371 /* bogus option but ignored */
6379 #include "nkf32dll.c"
6380 #elif defined(PERL_XS)
6381 #else /* WIN32DLL */
/*
 * main: command-line entry point (not built for WIN32DLL or PERL_XS).
 *
 * Leading arguments that start with '-' are consumed as options.  One path
 * converts stdin with kanji_convert(stdin); the other opens each remaining
 * argument with fopen() and converts it.  When file_out_f is TRUE, stdout is
 * redirected to a temporary file derived from the original name; afterwards
 * the original's permissions (and, with preserve_time_f, its timestamp) are
 * applied to the new file, the original is renamed to a backup name or
 * unlinked, and the temporary file is renamed over it.  With guess_f the
 * guessed code is printed instead.
 *
 * NOTE(review): this listing omits lines; the conditions selecting between
 * these paths and the return statements are not visible here.
 */
6383 main(int argc, char **argv)
6388 char *outfname = NULL;
6391 #ifdef EASYWIN /*Easy Win */
6392 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6394 #ifdef DEFAULT_CODE_LOCALE
6395 setlocale(LC_CTYPE, "");
6397 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6398 cp = (unsigned char *)*argv;
6403 if (pipe(fds) < 0 || (pid = fork()) < 0){
6414 execvp(argv[1], &argv[1]);
6431 int debug_f_back = debug_f;
6434 int exec_f_back = exec_f;
6437 int x0212_f_back = x0212_f;
6439 int x0213_f_back = x0213_f;
6440 int guess_f_back = guess_f;
6442 guess_f = guess_f_back;
6445 debug_f = debug_f_back;
6448 exec_f = exec_f_back;
6450 x0212_f = x0212_f_back;
6451 x0213_f = x0213_f_back;
6454 if (binmode_f == TRUE)
6455 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6456 if (freopen("","wb",stdout) == NULL)
6463 setbuf(stdout, (char *) NULL);
6465 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
6468 if (binmode_f == TRUE)
6469 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6470 if (freopen("","rb",stdin) == NULL) return (-1);
6474 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
6478 kanji_convert(stdin);
6479 if (guess_f) print_guessed_code(NULL);
6483 int is_argument_error = FALSE;
6485 input_codename = NULL;
6488 iconv_for_check = 0;
6490 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
6492 is_argument_error = TRUE;
6500 /* reopen file for stdout */
6501 if (file_out_f == TRUE) {
/* build the temporary output file name from origfname */
6504 outfname = nkf_xmalloc(strlen(origfname)
6505 + strlen(".nkftmpXXXXXX")
6507 strcpy(outfname, origfname);
6511 for (i = strlen(outfname); i; --i){
6512 if (outfname[i - 1] == '/'
6513 || outfname[i - 1] == '\\'){
6519 strcat(outfname, "ntXXXXXX");
6521 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
6522 S_IREAD | S_IWRITE);
6524 strcat(outfname, ".nkftmpXXXXXX");
6525 fd = mkstemp(outfname);
6528 || (fd_backup = dup(fileno(stdout))) < 0
6529 || dup2(fd, fileno(stdout)) < 0
6540 outfname = "nkf.out";
6543 if(freopen(outfname, "w", stdout) == NULL) {
6547 if (binmode_f == TRUE) {
6548 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6549 if (freopen("","wb",stdout) == NULL)
6556 if (binmode_f == TRUE)
6557 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
6558 if (freopen("","rb",fin) == NULL)
6563 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
6567 char *filename = NULL;
6569 if (nfiles > 1) filename = origfname;
6570 if (guess_f) print_guessed_code(filename);
6576 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6584 if (dup2(fd_backup, fileno(stdout)) < 0){
6587 if (stat(origfname, &sb)) {
6588 fprintf(stderr, "Can't stat %s\n", origfname);
6590 /* restore the permissions */
6591 if (chmod(outfname, sb.st_mode)) {
6592 fprintf(stderr, "Can't set permission %s\n", outfname);
6595 /* restore the timestamp */
6596 if(preserve_time_f){
6597 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
6598 tb[0] = tb[1] = sb.st_mtime;
6599 if (utime(outfname, tb)) {
6600 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6603 tb.actime = sb.st_atime;
6604 tb.modtime = sb.st_mtime;
6605 if (utime(outfname, &tb)) {
6606 fprintf(stderr, "Can't set timestamp %s\n", outfname);
6611 char *backup_filename = get_backup_filename(backup_suffix, origfname);
6613 unlink(backup_filename);
6615 if (rename(origfname, backup_filename)) {
6616 perror(backup_filename);
6617 fprintf(stderr, "Can't rename %s to %s\n",
6618 origfname, backup_filename);
6620 nkf_xfree(backup_filename);
6623 if (unlink(origfname)){
6628 if (rename(outfname, origfname)) {
6630 fprintf(stderr, "Can't rename %s to %s\n",
6631 outfname, origfname);
6633 nkf_xfree(outfname);
6638 if (is_argument_error)
6641 #ifdef EASYWIN /*Easy Win */
6642 if (file_out_f == FALSE)
6643 scanf("%d",&end_check);
6646 #else /* for Other OS */
6647 if (file_out_f == TRUE)
6649 #endif /*Easy Win */
6652 #endif /* WIN32DLL */