X-Git-Url: http://git.sourceforge.jp/view?p=nkf%2Fnkf.git;a=blobdiff_plain;f=nkf.c;h=b58c437d3cc0da1fd87a69c19feec8d0e6fe595f;hp=7a7ebf08363d78100b66ab0e8045e2c1dfc2a981;hb=9fd04d0dfbeff6ac8cb877b9853fb09af6811f33;hpb=6f25b7241069edf1e94a6f3af21378932eb6048e diff --git a/nkf.c b/nkf.c index 7a7ebf0..b58c437 100644 --- a/nkf.c +++ b/nkf.c @@ -1,6 +1,6 @@ /* * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA). - * Copyright (c) 1996-2009, The nkf Project. + * Copyright (c) 1996-2013, The nkf Project. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -20,11 +20,11 @@ * * 3. This notice may not be removed or altered from any source distribution. */ -#define NKF_VERSION "2.0.9" -#define NKF_RELEASE_DATE "2009-04-26" +#define NKF_VERSION "2.1.4" +#define NKF_RELEASE_DATE "2015-12-12" #define COPY_RIGHT \ "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \ - "Copyright (C) 1996-2009, The nkf Project." + "Copyright (C) 1996-2015, The nkf Project." #include "config.h" #include "nkf.h" @@ -210,6 +210,8 @@ struct { } encoding_name_to_id_table[] = { {"US-ASCII", ASCII}, {"ASCII", ASCII}, + {"646", ASCII}, + {"ROMAN8", ASCII}, {"ISO-2022-JP", ISO_2022_JP}, {"ISO2022JP-CP932", CP50220}, {"CP50220", CP50220}, @@ -221,6 +223,8 @@ struct { {"ISO-2022-JP-2004", ISO_2022_JP_2004}, {"SHIFT_JIS", SHIFT_JIS}, {"SJIS", SHIFT_JIS}, + {"MS_Kanji", SHIFT_JIS}, + {"PCK", SHIFT_JIS}, {"WINDOWS-31J", WINDOWS_31J}, {"CSWINDOWS31J", WINDOWS_31J}, {"CP932", WINDOWS_31J}, @@ -295,7 +299,7 @@ struct { && (c != '(') && (c != ')') && (c != '.') && (c != 0x22))) #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END) -#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c < (0xE0&0x7F)) +#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c <= 0x5F) #define HOLD_SIZE 1024 #if defined(INT_IS_SHORT) @@ -352,6 +356,7 @@ static int no_cp932ext_f = FALSE; /* ignore ZERO WIDTH NO-BREAK SPACE */ static int no_best_fit_chars_f = FALSE; static int input_endian = ENDIAN_BIG; +static int input_bom_f = FALSE; static nkf_char unicode_subchar = '?'; /* the regular substitution character */ static void (*encode_fallback)(nkf_char c) = NULL; static void w_status(struct input_code *, nkf_char); @@ -379,6 +384,8 @@ static unsigned char stdibuf[IOBUF_SIZE]; static unsigned char stdobuf[IOBUF_SIZE]; #endif +#define NKF_UNSPECIFIED (-TRUE) + /* flags */ static int unbuf_f = FALSE; static int estab_f = FALSE; @@ -393,7 +400,7 @@ static int mimebuf_f = FALSE; /* MIME buffered input */ static int broken_f = FALSE; /* convert ESC-less broken JIS */ static int iso8859_f = FALSE; /* ISO8859 through */ static int mimeout_f = FALSE; /* base64 mode */ -static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */ +static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */ static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */ #ifdef UNICODE_NORMALIZATION @@ -424,6 +431,8 @@ static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc; #define nkf_char_unicode_bmp_p(c) ((c & VALUE_MASK) <= UNICODE_BMP_MAX) #define nkf_char_unicode_value_p(c) ((c & VALUE_MASK) <= UNICODE_MAX) +#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00)) + #ifdef NUMCHAR_OPTION static int numchar_f = FALSE; static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */ @@ -471,7 +480,7 @@ struct input_code input_code_list[] = { {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0}, {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0}, #endif - {0} + {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0} }; static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */ @@ -500,7 +509,7 @@ static int fold_margin = FOLD_MARGIN; /* process default */ static nkf_char -no_connection2(nkf_char c2, nkf_char c1, nkf_char c0) +no_connection2(ARG_UNUSED nkf_char c2, ARG_UNUSED nkf_char c1, ARG_UNUSED nkf_char c0) { fprintf(stderr,"nkf internal module connection failure.\n"); exit(EXIT_FAILURE); @@ -614,6 +623,27 @@ static const unsigned char ev[]= { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00}; +/* X0201 kana to X0213 conversion table for han-daguten */ +/* 90-9F A0-DF */ +static const unsigned char ev_x0213[]= { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x25,0x77,0x25,0x78, + 0x25,0x79,0x25,0x7a,0x25,0x7b,0x00,0x00, + 0x00,0x00,0x00,0x00,0x25,0x7c,0x00,0x00, + 0x00,0x00,0x00,0x00,0x25,0x7d,0x00,0x00, + 0x25,0x7e,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00}; + /* X0208 kigou conversion table */ /* 0x8140 - 0x819e */ @@ -744,7 +774,7 @@ nkf_enc_find(const char *name) #ifdef DEFAULT_CODE_LOCALE static const char* -nkf_locale_charmap() +nkf_locale_charmap(void) { #ifdef HAVE_LANGINFO_H return nl_langinfo(CODESET); @@ -772,7 +802,7 @@ nkf_locale_charmap() } static nkf_encoding* -nkf_locale_encoding() +nkf_locale_encoding(void) { nkf_encoding *enc = 0; const char *encname = nkf_locale_charmap(); @@ -783,13 +813,13 @@ nkf_locale_encoding() #endif /* DEFAULT_CODE_LOCALE */ static nkf_encoding* -nkf_utf8_encoding() +nkf_utf8_encoding(void) { return &nkf_encoding_table[UTF_8]; } static nkf_encoding* -nkf_default_encoding() +nkf_default_encoding(void) { nkf_encoding *enc = 0; #ifdef DEFAULT_CODE_LOCALE @@ -811,11 +841,11 @@ static nkf_buf_t * nkf_buf_new(int length) { nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t)); - buf->ptr = nkf_xmalloc(length); + buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length); buf->capa = length; buf->len = 0; return buf; -} +} #if 0 static void @@ -829,7 +859,7 @@ nkf_buf_dispose(nkf_buf_t *buf) #define nkf_buf_length(buf) ((buf)->len) #define nkf_buf_empty_p(buf) ((buf)->len == 0) -static unsigned char +static nkf_char nkf_buf_at(nkf_buf_t *buf, int index) { assert(index <= buf->len); @@ -851,7 +881,7 @@ nkf_buf_push(nkf_buf_t *buf, nkf_char c) buf->ptr[buf->len++] = c; } -static unsigned char +static nkf_char nkf_buf_pop(nkf_buf_t *buf) { assert(!nkf_buf_empty_p(buf)); @@ -896,7 +926,7 @@ usage(void) " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n" " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n" " 4: JISX0208 Katakana to JISX0201 Katakana\n" - " X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n" + " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n" ); fprintf(HELP_OUTPUT, " O Output to File (DEFAULT 'nkf.out')\n" @@ -1028,7 +1058,7 @@ nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c) int shift = 20; c &= VALUE_MASK; while(shift >= 0){ - if(c >= 1<= NKF_INT32_C(1)<= 0){ (*f)(0, bin2hex(c>>shift)); shift -= 4; @@ -1203,9 +1233,10 @@ set_input_encoding(nkf_encoding *enc) case ISO_8859_1: iso8859_f = TRUE; break; - case CP50220: case CP50221: case CP50222: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ + case CP50220: #ifdef SHIFTJIS_CP932 cp51932_f = TRUE; #endif @@ -1227,6 +1258,7 @@ set_input_encoding(nkf_encoding *enc) case SHIFT_JIS: break; case WINDOWS_31J: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = TRUE; #endif @@ -1248,6 +1280,7 @@ set_input_encoding(nkf_encoding *enc) case EUCJP_NKF: break; case CP51932: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = TRUE; #endif @@ -1256,6 +1289,7 @@ set_input_encoding(nkf_encoding *enc) #endif break; case EUCJP_MS: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = FALSE; #endif @@ -1264,6 +1298,7 @@ set_input_encoding(nkf_encoding *enc) #endif break; case EUCJP_ASCII: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = FALSE; #endif @@ -1276,6 +1311,7 @@ set_input_encoding(nkf_encoding *enc) x0213_f = TRUE; #ifdef SHIFTJIS_CP932 cp51932_f = FALSE; + if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif break; case EUC_JISX0213: @@ -1318,7 +1354,6 @@ set_output_encoding(nkf_encoding *enc) { switch (nkf_enc_to_index(enc)) { case CP50220: - x0201_f = TRUE; #ifdef SHIFTJIS_CP932 if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif @@ -1327,6 +1362,7 @@ set_output_encoding(nkf_encoding *enc) #endif break; case CP50221: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif @@ -1334,6 +1370,11 @@ set_output_encoding(nkf_encoding *enc) ms_ucs_map_f = UCS_MAP_CP932; #endif break; + case ISO_2022_JP: +#ifdef SHIFTJIS_CP932 + if (cp932inv_f == TRUE) cp932inv_f = FALSE; +#endif + break; case ISO_2022_JP_1: x0212_f = TRUE; #ifdef SHIFTJIS_CP932 @@ -1341,6 +1382,7 @@ set_output_encoding(nkf_encoding *enc) #endif break; case ISO_2022_JP_3: + case ISO_2022_JP_2004: x0212_f = TRUE; x0213_f = TRUE; #ifdef SHIFTJIS_CP932 @@ -1350,6 +1392,7 @@ set_output_encoding(nkf_encoding *enc) case SHIFT_JIS: break; case WINDOWS_31J: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef UTF8_OUTPUT_ENABLE ms_ucs_map_f = UCS_MAP_CP932; #endif @@ -1378,6 +1421,7 @@ set_output_encoding(nkf_encoding *enc) #endif break; case CP51932: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif @@ -1386,12 +1430,14 @@ set_output_encoding(nkf_encoding *enc) #endif break; case EUCJP_MS: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ x0212_f = TRUE; #ifdef UTF8_OUTPUT_ENABLE ms_ucs_map_f = UCS_MAP_MS; #endif break; case EUCJP_ASCII: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ x0212_f = TRUE; #ifdef UTF8_OUTPUT_ENABLE ms_ucs_map_f = UCS_MAP_ASCII; @@ -1520,13 +1566,26 @@ x0212_unshift(nkf_char c) } #endif /* X0212_ENABLE */ +static int +is_x0213_2_in_x0212(nkf_char c1) +{ + static const char x0213_2_table[] = + {0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1}; + int ku = c1 - 0x20; + if (ku <= 15) + return x0213_2_table[ku]; /* 1, 3-5, 8, 12-15 */ + if (78 <= ku && ku <= 94) + return 1; + return 0; +} + static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) { nkf_char ndx; if (is_eucg3(c2)){ ndx = c2 & 0x7f; - if (x0213_f){ + if (x0213_f && is_x0213_2_in_x0212(ndx)){ if((0x21 <= ndx && ndx <= 0x2F)){ if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3; if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); @@ -1572,7 +1631,7 @@ s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} }; if (0xFC < c1) return 1; #ifdef SHIFTJIS_CP932 - if (!cp932inv_f && is_ibmext_in_sjis(c2)){ + if (!cp932inv_f && !x0213_f && is_ibmext_in_sjis(c2)){ val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40]; if (val){ c2 = val >> 8; @@ -1675,7 +1734,7 @@ nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) /* single byte */ wc = c1; } - else if (c1 <= 0xC3) { + else if (c1 <= 0xC1) { /* trail byte or invalid */ return -1; } @@ -1815,6 +1874,7 @@ unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_c ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 : ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms : ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac : + x0213_f ? utf8_to_euc_2bytes_x0213 : utf8_to_euc_2bytes; ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1); }else if(c0 < 0xF0){ @@ -1882,6 +1942,7 @@ unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_c ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 : ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms : ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac : + x0213_f ? utf8_to_euc_3bytes_x0213 : utf8_to_euc_3bytes; ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1); }else return -1; @@ -1899,6 +1960,15 @@ unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_c } #ifdef UTF8_OUTPUT_ENABLE +#define X0213_SURROGATE_FIND(tbl, size, euc) do { \ + int i; \ + for (i = 0; i < size; i++) \ + if (tbl[i][0] == euc) { \ + low = tbl[i][2]; \ + break; \ + } \ + } while (0) + static nkf_char e2w_conv(nkf_char c2, nkf_char c1) { @@ -1921,7 +1991,9 @@ e2w_conv(nkf_char c2, nkf_char c1) } c2 = (c2&0x7f) - 0x21; if (0<=c2 && c2= sizeof_x0213_combining_chars) + return 0; + euc = (c2&0x7f)<<8 | (c1&0x7f); + for (i = 0; i < sizeof_x0213_combining_table; i++) + if (x0213_combining_table[i][0] == euc) + return x0213_combining_table[i][1]; return 0; } #endif @@ -1986,6 +2090,25 @@ w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1) } } else { + int i; + if (x0213_f) { + c1 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ + c2 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ + for (i = 0; i < sizeof_x0213_1_surrogate_table; i++) + if (x0213_1_surrogate_table[i][1] == c1 && x0213_1_surrogate_table[i][2] == c2) { + val = x0213_1_surrogate_table[i][0]; + *p2 = val >> 8; + *p1 = val & 0xFF; + return 0; + } + for (i = 0; i < sizeof_x0213_2_surrogate_table; i++) + if (x0213_2_surrogate_table[i][1] == c1 && x0213_2_surrogate_table[i][2] == c2) { + val = x0213_2_surrogate_table[i][0]; + *p2 = PREFIX_EUCG3 | (val >> 8); + *p1 = val & 0xFF; + return 0; + } + } *p2 = 0; *p1 = nkf_char_unicode_new(val); } @@ -2058,7 +2181,7 @@ e_iconv(nkf_char c2, nkf_char c1, nkf_char c0) } static nkf_char -s_iconv(nkf_char c2, nkf_char c1, nkf_char c0) +s_iconv(ARG_UNUSED nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0) { if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) { if (iso2022jp_f && !x0201_f) { @@ -2081,6 +2204,30 @@ s_iconv(nkf_char c2, nkf_char c1, nkf_char c0) return 0; } +static int +x0213_wait_combining_p(nkf_char wc) +{ + int i; + for (i = 0; i < sizeof_x0213_combining_table; i++) { + if (x0213_combining_table[i][1] == wc) { + return TRUE; + } + } + return FALSE; +} + +static int +x0213_combining_p(nkf_char wc) +{ + int i; + for (i = 0; i < sizeof_x0213_combining_chars; i++) { + if (x0213_combining_chars[i] == wc) { + return TRUE; + } + } + return FALSE; +} + static nkf_char w_iconv(nkf_char c1, nkf_char c2, nkf_char c3) { @@ -2148,6 +2295,8 @@ w_iconv(nkf_char c1, nkf_char c2, nkf_char c3) c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4)); c1 = 0; } else { + if (x0213_f && x0213_wait_combining_p(nkf_utf8_to_unicode(c1, c2, c3, c4))) + return -3; ret = w2e_conv(c1, c2, c3, &c1, &c2); } if (ret == 0){ @@ -2156,9 +2305,22 @@ w_iconv(nkf_char c1, nkf_char c2, nkf_char c3) return ret; } +static nkf_char +w_iconv_nocombine(nkf_char c1, nkf_char c2, nkf_char c3) +{ + /* continue from the line below 'return -3;' in w_iconv() */ + nkf_char ret = w2e_conv(c1, c2, c3, &c1, &c2); + if (ret == 0){ + (*oconv)(c1, c2); + } + return ret; +} + #define NKF_ICONV_INVALID_CODE_RANGE -13 +#define NKF_ICONV_WAIT_COMBINING_CHAR -14 +#define NKF_ICONV_NOT_COMBINED -15 static size_t -unicode_iconv(nkf_char wc) +unicode_iconv(nkf_char wc, int nocombine) { nkf_char c1, c2; int ret = 0; @@ -2170,6 +2332,8 @@ unicode_iconv(nkf_char wc) /* unpaired surrogate */ return NKF_ICONV_INVALID_CODE_RANGE; }else if (wc < 0xFFFF) { + if (!nocombine && x0213_f && x0213_wait_combining_p(wc)) + return NKF_ICONV_WAIT_COMBINING_CHAR; ret = w16e_conv(wc, &c2, &c1); if (ret) return ret; }else if (wc < 0x10FFFF) { @@ -2182,9 +2346,50 @@ unicode_iconv(nkf_char wc) return 0; } -#define NKF_ICONV_NEED_ONE_MORE_BYTE -1 -#define NKF_ICONV_NEED_TWO_MORE_BYTES -2 -#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00)) +static nkf_char +unicode_iconv_combine(nkf_char wc, nkf_char wc2) +{ + nkf_char c1, c2; + int i; + + if (wc2 < 0x80) { + return NKF_ICONV_NOT_COMBINED; + }else if ((wc2>>11) == 27) { + /* unpaired surrogate */ + return NKF_ICONV_INVALID_CODE_RANGE; + }else if (wc2 < 0xFFFF) { + if (!x0213_combining_p(wc2)) + return NKF_ICONV_NOT_COMBINED; + for (i = 0; i < sizeof_x0213_combining_table; i++) { + if (x0213_combining_table[i][1] == wc && + x0213_combining_table[i][2] == wc2) { + c2 = x0213_combining_table[i][0] >> 8; + c1 = x0213_combining_table[i][0] & 0x7f; + (*oconv)(c2, c1); + return 0; + } + } + }else if (wc2 < 0x10FFFF) { + return NKF_ICONV_NOT_COMBINED; + } else { + return NKF_ICONV_INVALID_CODE_RANGE; + } + return NKF_ICONV_NOT_COMBINED; +} + +static nkf_char +w_iconv_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6) +{ + nkf_char wc, wc2; + wc = nkf_utf8_to_unicode(c1, c2, c3, 0); + wc2 = nkf_utf8_to_unicode(c4, c5, c6, 0); + if (wc2 < 0) + return wc2; + return unicode_iconv_combine(wc, wc2); +} + +#define NKF_ICONV_NEED_ONE_MORE_BYTE (size_t)-1 +#define NKF_ICONV_NEED_TWO_MORE_BYTES (size_t)-2 static size_t nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) { @@ -2213,31 +2418,63 @@ nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) } } - return (*unicode_iconv)(wc); + return (*unicode_iconv)(wc, FALSE); +} + +static size_t +nkf_iconv_utf_16_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) +{ + nkf_char wc, wc2; + + if (input_endian == ENDIAN_BIG) { + if (0xD8 <= c3 && c3 <= 0xDB) { + return NKF_ICONV_NOT_COMBINED; + } else { + wc = c1 << 8 | c2; + wc2 = c3 << 8 | c4; + } + } else { + if (0xD8 <= c2 && c2 <= 0xDB) { + return NKF_ICONV_NOT_COMBINED; + } else { + wc = c2 << 8 | c1; + wc2 = c4 << 8 | c3; + } + } + + return unicode_iconv_combine(wc, wc2); +} + +static size_t +nkf_iconv_utf_16_nocombine(nkf_char c1, nkf_char c2) +{ + nkf_char wc; + if (input_endian == ENDIAN_BIG) + wc = c1 << 8 | c2; + else + wc = c2 << 8 | c1; + return (*unicode_iconv)(wc, TRUE); } static nkf_char -w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0) +w_iconv16(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0) { - return 0; + (*oconv)(c2, c1); + return 16; /* different from w_iconv32 */ } static nkf_char -w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0) +w_iconv32(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0) { - return 0; + (*oconv)(c2, c1); + return 32; /* different from w_iconv16 */ } -static size_t -nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) +static nkf_char +utf32_to_nkf_char(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) { nkf_char wc; - if (c1 == EOF) { - (*oconv)(EOF, 0); - return 0; - } - switch(input_endian){ case ENDIAN_BIG: wc = c2 << 16 | c3 << 8 | c4; @@ -2254,8 +2491,48 @@ nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) default: return NKF_ICONV_INVALID_CODE_RANGE; } + return wc; +} - return (*unicode_iconv)(wc); +static size_t +nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) +{ + nkf_char wc; + + if (c1 == EOF) { + (*oconv)(EOF, 0); + return 0; + } + + wc = utf32_to_nkf_char(c1, c2, c3, c4); + if (wc < 0) + return wc; + + return (*unicode_iconv)(wc, FALSE); +} + +static nkf_char +nkf_iconv_utf_32_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6, nkf_char c7, nkf_char c8) +{ + nkf_char wc, wc2; + + wc = utf32_to_nkf_char(c1, c2, c3, c4); + if (wc < 0) + return wc; + wc2 = utf32_to_nkf_char(c5, c6, c7, c8); + if (wc2 < 0) + return wc2; + + return unicode_iconv_combine(wc, wc2); +} + +static size_t +nkf_iconv_utf_32_nocombine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) +{ + nkf_char wc; + + wc = utf32_to_nkf_char(c1, c2, c3, c4); + return (*unicode_iconv)(wc, TRUE); } #endif @@ -2510,11 +2787,19 @@ s_oconv(nkf_char c2, nkf_char c1) } #ifdef UTF8_OUTPUT_ENABLE +#define OUTPUT_UTF8(val) do { \ + nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); \ + (*o_putc)(c1); \ + if (c2) (*o_putc)(c2); \ + if (c3) (*o_putc)(c3); \ + if (c4) (*o_putc)(c4); \ + } while (0) + static void w_oconv(nkf_char c2, nkf_char c1) { nkf_char c3, c4; - nkf_char val; + nkf_char val, val2; if (output_bom_f) { output_bom_f = FALSE; @@ -2530,11 +2815,7 @@ w_oconv(nkf_char c2, nkf_char c1) if (c2 == 0 && nkf_char_unicode_p(c1)){ val = c1 & VALUE_MASK; - nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); - (*o_putc)(c1); - if (c2) (*o_putc)(c2); - if (c3) (*o_putc)(c3); - if (c4) (*o_putc)(c4); + OUTPUT_UTF8(val); return; } @@ -2543,27 +2824,46 @@ w_oconv(nkf_char c2, nkf_char c1) } else { val = e2w_conv(c2, c1); if (val){ - nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); - (*o_putc)(c1); - if (c2) (*o_putc)(c2); - if (c3) (*o_putc)(c3); - if (c4) (*o_putc)(c4); + val2 = e2w_combining(val, c2, c1); + if (val2) + OUTPUT_UTF8(val2); + OUTPUT_UTF8(val); } } } +#define OUTPUT_UTF16_BYTES(c1, c2) do { \ + if (output_endian == ENDIAN_LITTLE){ \ + (*o_putc)(c1); \ + (*o_putc)(c2); \ + }else{ \ + (*o_putc)(c2); \ + (*o_putc)(c1); \ + } \ + } while (0) + +#define OUTPUT_UTF16(val) do { \ + if (nkf_char_unicode_bmp_p(val)) { \ + c2 = (val >> 8) & 0xff; \ + c1 = val & 0xff; \ + OUTPUT_UTF16_BYTES(c1, c2); \ + } else { \ + val &= VALUE_MASK; \ + if (val <= UNICODE_MAX) { \ + c2 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ \ + c1 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ \ + OUTPUT_UTF16_BYTES(c2 & 0xff, (c2 >> 8) & 0xff); \ + OUTPUT_UTF16_BYTES(c1 & 0xff, (c1 >> 8) & 0xff); \ + } \ + } \ + } while (0) + static void w_oconv16(nkf_char c2, nkf_char c1) { if (output_bom_f) { output_bom_f = FALSE; - if (output_endian == ENDIAN_LITTLE){ - (*o_putc)(0xFF); - (*o_putc)(0xFE); - }else{ - (*o_putc)(0xFE); - (*o_putc)(0xFF); - } + OUTPUT_UTF16_BYTES(0xFF, 0xFE); } if (c2 == EOF) { @@ -2572,44 +2872,34 @@ w_oconv16(nkf_char c2, nkf_char c1) } if (c2 == 0 && nkf_char_unicode_p(c1)) { - if (nkf_char_unicode_bmp_p(c1)) { - c2 = (c1 >> 8) & 0xff; - c1 &= 0xff; - } else { - c1 &= VALUE_MASK; - if (c1 <= UNICODE_MAX) { - c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ - c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ - if (output_endian == ENDIAN_LITTLE){ - (*o_putc)(c2 & 0xff); - (*o_putc)((c2 >> 8) & 0xff); - (*o_putc)(c1 & 0xff); - (*o_putc)((c1 >> 8) & 0xff); - }else{ - (*o_putc)((c2 >> 8) & 0xff); - (*o_putc)(c2 & 0xff); - (*o_putc)((c1 >> 8) & 0xff); - (*o_putc)(c1 & 0xff); - } - } - return; - } + OUTPUT_UTF16(c1); } else if (c2) { - nkf_char val = e2w_conv(c2, c1); - c2 = (val >> 8) & 0xff; - c1 = val & 0xff; + nkf_char val, val2; + val = e2w_conv(c2, c1); if (!val) return; - } - - if (output_endian == ENDIAN_LITTLE){ - (*o_putc)(c1); - (*o_putc)(c2); - }else{ - (*o_putc)(c2); - (*o_putc)(c1); + val2 = e2w_combining(val, c2, c1); + if (val2) + OUTPUT_UTF16(val2); + OUTPUT_UTF16(val); + } else { + OUTPUT_UTF16_BYTES(c1, c2); } } +#define OUTPUT_UTF32(c) do { \ + if (output_endian == ENDIAN_LITTLE){ \ + (*o_putc)( (c) & 0xFF); \ + (*o_putc)(((c) >> 8) & 0xFF); \ + (*o_putc)(((c) >> 16) & 0xFF); \ + (*o_putc)(0); \ + }else{ \ + (*o_putc)(0); \ + (*o_putc)(((c) >> 16) & 0xFF); \ + (*o_putc)(((c) >> 8) & 0xFF); \ + (*o_putc)( (c) & 0xFF); \ + } \ + } while (0) + static void w_oconv32(nkf_char c2, nkf_char c1) { @@ -2638,20 +2928,15 @@ w_oconv32(nkf_char c2, nkf_char c1) } else if (c2 == 0 && nkf_char_unicode_p(c1)) { c1 &= VALUE_MASK; } else if (c2) { - c1 = e2w_conv(c2, c1); - if (!c1) return; - } - if (output_endian == ENDIAN_LITTLE){ - (*o_putc)( c1 & 0xFF); - (*o_putc)((c1 >> 8) & 0xFF); - (*o_putc)((c1 >> 16) & 0xFF); - (*o_putc)(0); - }else{ - (*o_putc)(0); - (*o_putc)((c1 >> 16) & 0xFF); - (*o_putc)((c1 >> 8) & 0xFF); - (*o_putc)( c1 & 0xFF); + nkf_char val, val2; + val = e2w_conv(c2, c1); + if (!val) return; + val2 = e2w_combining(val, c2, c1); + if (val2) + OUTPUT_UTF32(val2); + c1 = val; } + OUTPUT_UTF32(c1); } #endif @@ -2660,7 +2945,8 @@ w_oconv32(nkf_char c2, nkf_char c1) #define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */ #define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */ #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */ -#define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */ +#define SCORE_X0213 (SCORE_X0212 << 1) /* JIS X 0213 */ +#define SCORE_NO_EXIST (SCORE_X0213 << 1) /* Undefined Characters */ #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */ #define SCORE_ERROR (SCORE_iMIME << 1) /* Error */ @@ -2670,14 +2956,35 @@ static const nkf_char score_table_A0[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, - SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST, + SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_X0213, }; static const nkf_char score_table_F0[] = { SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2, - SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, + SCORE_L2, SCORE_DEPEND, SCORE_X0213, SCORE_X0213, SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932, - SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR, + SCORE_CP932, SCORE_X0213, SCORE_X0213, SCORE_ERROR, +}; + +static const nkf_char score_table_8FA0[] = { + 0, SCORE_X0213, SCORE_X0212, SCORE_X0213, + SCORE_X0213, SCORE_X0213, SCORE_X0212, SCORE_X0212, + SCORE_X0213, SCORE_X0212, SCORE_X0212, SCORE_X0212, + SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213, +}; + +static const nkf_char score_table_8FE0[] = { + SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212, + SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212, + SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212, + SCORE_X0212, SCORE_X0212, SCORE_X0213, SCORE_X0213, +}; + +static const nkf_char score_table_8FF0[] = { + SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0212, + SCORE_X0212, SCORE_X0213, SCORE_X0213, SCORE_X0213, + SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213, + SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213, }; static void @@ -2700,15 +3007,21 @@ static void code_score(struct input_code *ptr) { nkf_char c2 = ptr->buf[0]; -#ifdef UTF8_OUTPUT_ENABLE nkf_char c1 = ptr->buf[1]; -#endif if (c2 < 0){ set_code_score(ptr, SCORE_ERROR); }else if (c2 == SS2){ set_code_score(ptr, SCORE_KANA); }else if (c2 == 0x8f){ - set_code_score(ptr, SCORE_X0212); + if ((c1 & 0x70) == 0x20){ + set_code_score(ptr, score_table_8FA0[c1 & 0x0f]); + }else if ((c1 & 0x70) == 0x60){ + set_code_score(ptr, score_table_8FE0[c1 & 0x0f]); + }else if ((c1 & 0x70) == 0x70){ + set_code_score(ptr, score_table_8FF0[c1 & 0x0f]); + }else{ + set_code_score(ptr, SCORE_X0212); + } #ifdef UTF8_OUTPUT_ENABLE }else if (!e2w_conv(c2, c1)){ set_code_score(ptr, SCORE_NO_EXIST); @@ -3024,7 +3337,7 @@ std_getc(FILE *f) #endif /*WIN32DLL*/ static nkf_char -std_ungetc(nkf_char c, FILE *f) +std_ungetc(nkf_char c, ARG_UNUSED FILE *f) { nkf_buf_push(nkf_state->std_gc_buf, c); return c; @@ -3039,23 +3352,24 @@ std_putc(nkf_char c) } #endif /*WIN32DLL*/ -static unsigned char hold_buf[HOLD_SIZE*2]; +static nkf_char hold_buf[HOLD_SIZE*2]; static int hold_count = 0; static nkf_char push_hold_buf(nkf_char c2) { if (hold_count >= HOLD_SIZE*2) return (EOF); - hold_buf[hold_count++] = (unsigned char)c2; + hold_buf[hold_count++] = c2; return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); } static int -h_conv(FILE *f, int c1, int c2) +h_conv(FILE *f, nkf_char c1, nkf_char c2) { - int ret, c4, c3; + int ret; int hold_index; - + int fromhold_count; + nkf_char c3, c4; /** it must NOT be in the kanji shifte sequence */ /** it must NOT be written in JIS7 */ @@ -3105,15 +3419,21 @@ h_conv(FILE *f, int c1, int c2) hold_index = 0; while (hold_index < hold_count){ c1 = hold_buf[hold_index++]; - if (c1 <= DEL){ + if (nkf_char_unicode_p(c1)) { + (*oconv)(0, c1); + continue; + } + else if (c1 <= DEL){ (*iconv)(0, c1, 0); continue; }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){ (*iconv)(JIS_X_0201_1976_K, c1, 0); continue; } + fromhold_count = 1; if (hold_index < hold_count){ c2 = hold_buf[hold_index++]; + fromhold_count++; }else{ c2 = (*i_getc)(f); if (c2 == EOF){ @@ -3131,16 +3451,47 @@ h_conv(FILE *f, int c1, int c2) } else if ((c3 = (*i_getc)(f)) == EOF) { ret = EOF; break; - } else { - code_status(c3); - if (hold_index < hold_count){ - c4 = hold_buf[hold_index++]; - } else if ((c4 = (*i_getc)(f)) == EOF) { - c3 = ret = EOF; - break; + } + code_status(c3); + if (hold_index < hold_count){ + c4 = hold_buf[hold_index++]; + } else if ((c4 = (*i_getc)(f)) == EOF) { + c3 = ret = EOF; + break; + } + code_status(c4); + (*iconv)(c1, c2, (c3<<8)|c4); + break; + case -3: + /* 4 bytes UTF-8 (check combining character) */ + if (hold_index < hold_count){ + c3 = hold_buf[hold_index++]; + fromhold_count++; + } else if ((c3 = (*i_getc)(f)) == EOF) { + w_iconv_nocombine(c1, c2, 0); + break; + } + if (hold_index < hold_count){ + c4 = hold_buf[hold_index++]; + fromhold_count++; + } else if ((c4 = (*i_getc)(f)) == EOF) { + w_iconv_nocombine(c1, c2, 0); + if (fromhold_count <= 2) + (*i_ungetc)(c3,f); + else + hold_index--; + continue; + } + if (w_iconv_combine(c1, c2, 0, c3, c4, 0)) { + w_iconv_nocombine(c1, c2, 0); + if (fromhold_count <= 2) { + (*i_ungetc)(c4,f); + (*i_ungetc)(c3,f); + } else if (fromhold_count == 3) { + (*i_ungetc)(c4,f); + hold_index--; } else { - code_status(c4); - (*iconv)(c1, c2, (c3<<8)|c4); + hold_index -= 2; } } break; @@ -3148,13 +3499,68 @@ h_conv(FILE *f, int c1, int c2) /* 3 bytes EUC or UTF-8 */ if (hold_index < hold_count){ c3 = hold_buf[hold_index++]; + fromhold_count++; } else if ((c3 = (*i_getc)(f)) == EOF) { ret = EOF; break; } else { code_status(c3); } - (*iconv)(c1, c2, c3); + if ((*iconv)(c1, c2, c3) == -3) { + /* 6 bytes UTF-8 (check combining character) */ + nkf_char c5, c6; + if (hold_index < hold_count){ + c4 = hold_buf[hold_index++]; + fromhold_count++; + } else if ((c4 = (*i_getc)(f)) == EOF) { + w_iconv_nocombine(c1, c2, c3); + continue; + } + if (hold_index < hold_count){ + c5 = hold_buf[hold_index++]; + fromhold_count++; + } else if ((c5 = (*i_getc)(f)) == EOF) { + w_iconv_nocombine(c1, c2, c3); + if (fromhold_count == 4) + hold_index--; + else + (*i_ungetc)(c4,f); + continue; + } + if (hold_index < hold_count){ + c6 = hold_buf[hold_index++]; + fromhold_count++; + } else if ((c6 = (*i_getc)(f)) == EOF) { + w_iconv_nocombine(c1, c2, c3); + if (fromhold_count == 5) { + hold_index -= 2; + } else if (fromhold_count == 4) { + hold_index--; + (*i_ungetc)(c5,f); + } else { + (*i_ungetc)(c5,f); + (*i_ungetc)(c4,f); + } + continue; + } + if (w_iconv_combine(c1, c2, c3, c4, c5, c6)) { + w_iconv_nocombine(c1, c2, c3); + if (fromhold_count == 6) { + hold_index -= 3; + } else if (fromhold_count == 5) { + hold_index -= 2; + (*i_ungetc)(c6,f); + } else if (fromhold_count == 4) { + hold_index--; + (*i_ungetc)(c6,f); + (*i_ungetc)(c5,f); + } else { + (*i_ungetc)(c6,f); + (*i_ungetc)(c5,f); + (*i_ungetc)(c4,f); + } + } + } break; } if (c3 == EOF) break; @@ -3169,6 +3575,7 @@ static void check_bom(FILE *f) { int c2; + input_bom_f = FALSE; switch(c2 = (*i_getc)(f)){ case 0x00: if((c2 = (*i_getc)(f)) == 0x00){ @@ -3178,6 +3585,7 @@ check_bom(FILE *f) set_iconv(TRUE, w_iconv32); } if (iconv == w_iconv32) { + input_bom_f = TRUE; input_endian = ENDIAN_BIG; return; } @@ -3208,6 +3616,7 @@ check_bom(FILE *f) set_iconv(TRUE, w_iconv); } if (iconv == w_iconv) { + input_bom_f = TRUE; return; } (*i_ungetc)(0xBF,f); @@ -3236,6 +3645,7 @@ check_bom(FILE *f) } if (iconv == w_iconv16) { input_endian = ENDIAN_BIG; + input_bom_f = TRUE; return; } (*i_ungetc)(0xFF,f); @@ -3251,6 +3661,7 @@ check_bom(FILE *f) } if (iconv == w_iconv32) { input_endian = ENDIAN_LITTLE; + input_bom_f = TRUE; return; } (*i_ungetc)(0x00,f); @@ -3262,6 +3673,7 @@ check_bom(FILE *f) } if (iconv == w_iconv16) { input_endian = ENDIAN_LITTLE; + input_bom_f = TRUE; return; } (*i_ungetc)(0xFE,f); @@ -3314,7 +3726,7 @@ broken_getc(FILE *f) } static nkf_char -broken_ungetc(nkf_char c, FILE *f) +broken_ungetc(nkf_char c, ARG_UNUSED FILE *f) { if (nkf_buf_length(nkf_state->broken_buf) < 2) nkf_buf_push(nkf_state->broken_buf, c); @@ -3342,6 +3754,40 @@ eol_conv(nkf_char c2, nkf_char c1) else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1); } +static void +put_newline(void (*func)(nkf_char)) +{ + switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { + case CRLF: + (*func)(0x0D); + (*func)(0x0A); + break; + case CR: + (*func)(0x0D); + break; + case LF: + (*func)(0x0A); + break; + } +} + +static void +oconv_newline(void (*func)(nkf_char, nkf_char)) +{ + switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { + case CRLF: + (*func)(0, 0x0D); + (*func)(0, 0x0A); + break; + case CR: + (*func)(0, 0x0D); + break; + case LF: + (*func)(0, 0x0A); + break; + } +} + /* Return value of fold_conv() @@ -3388,8 +3834,8 @@ fold_conv(nkf_char c2, nkf_char c1) f_prev = c1; f_line = 0; fold_state = CR; - } else if ((f_prev == c1 && !fold_preserve_f) - || (f_prev == LF && fold_preserve_f) + } else if ((f_prev == c1) + || (f_prev == LF) ) { /* duplicate newline */ if (f_line) { f_line = 0; @@ -3436,7 +3882,7 @@ fold_conv(nkf_char c2, nkf_char c1) f_prev = c1; if (c2 || c2 == JIS_X_0201_1976_K) f_prev |= 0x80; /* this is Japanese */ - f_line += char_size(c2,c1); + f_line += c2 == JIS_X_0201_1976_K ? 1: char_size(c2,c1); if (f_line<=fold_len) { /* normal case */ fold_state = 1; } else { @@ -3514,13 +3960,13 @@ fold_conv(nkf_char c2, nkf_char c1) /* terminator process */ switch(fold_state) { case LF: - OCONV_NEWLINE((*o_fconv)); + oconv_newline(o_fconv); (*o_fconv)(c2,c1); break; case 0: return; case CR: - OCONV_NEWLINE((*o_fconv)); + oconv_newline(o_fconv); break; case TAB: case SP: @@ -3555,13 +4001,17 @@ z_conv(nkf_char c2, nkf_char c1) z_prev2 = 0; (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]); return; + } else if (x0213_f && c1 == (0xdf&0x7f) && ev_x0213[(z_prev1-SP)*2]) { /* 半濁点 */ + z_prev2 = 0; + (*o_zconv)(ev_x0213[(z_prev1-SP)*2], ev_x0213[(z_prev1-SP)*2+1]); + return; } } z_prev2 = 0; (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]); } if (c2 == JIS_X_0201_1976_K) { - if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) { + if (dv[(c1-SP)*2] || ev[(c1-SP)*2] || (x0213_f && ev_x0213[(c1-SP)*2])) { /* wait for 濁点 or 半濁点 */ z_prev1 = c1; z_prev2 = c2; @@ -3669,8 +4119,8 @@ z_conv(nkf_char c2, nkf_char c1) 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000, 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00, 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00, - 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 + 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x365F, + 0x375F, 0x385F, 0x395F, 0x3A5F, 0x3E5F, 0x425F, 0x445F, 0x0000 }; if (fullwidth_to_halfwidth[c1-0x20]){ c2 = fullwidth_to_halfwidth[c1-0x20]; @@ -3680,6 +4130,10 @@ z_conv(nkf_char c2, nkf_char c1) } return; } + } else if (c2 == 0 && nkf_char_unicode_p(c1) && + ((c1&VALUE_MASK) == 0x3099 || (c1&VALUE_MASK) == 0x309A)) { /* 合成用濁点・半濁点 */ + (*o_zconv)(JIS_X_0201_1976_K, 0x5E + (c1&VALUE_MASK) - 0x3099); + return; } } (*o_zconv)(c2,c1); @@ -3807,6 +4261,7 @@ static const unsigned char *mime_pattern[] = { (const unsigned char *)"\075?ISO-8859-1?Q?", (const unsigned char *)"\075?ISO-8859-1?B?", (const unsigned char *)"\075?ISO-2022-JP?B?", + (const unsigned char *)"\075?ISO-2022-JP?B?", (const unsigned char *)"\075?ISO-2022-JP?Q?", #if defined(UTF8_INPUT_ENABLE) (const unsigned char *)"\075?UTF-8?B?", @@ -3819,7 +4274,7 @@ static const unsigned char *mime_pattern[] = { /* 該当するコードの優先度を上げるための目印 */ nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = { - e_iconv, s_iconv, 0, 0, 0, 0, + e_iconv, s_iconv, 0, 0, 0, 0, 0, #if defined(UTF8_INPUT_ENABLE) w_iconv, w_iconv, #endif @@ -3827,7 +4282,7 @@ nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = { }; static const nkf_char mime_encode[] = { - EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, + EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K, #if defined(UTF8_INPUT_ENABLE) UTF_8, UTF_8, #endif @@ -3836,7 +4291,7 @@ static const nkf_char mime_encode[] = { }; static const nkf_char mime_encode_method[] = { - 'B', 'B','Q', 'B', 'B', 'Q', + 'B', 'B','Q', 'B', 'B', 'B', 'Q', #if defined(UTF8_INPUT_ENABLE) 'B', 'Q', #endif @@ -3867,7 +4322,7 @@ mime_input_buf_unshift(nkf_char c) } static nkf_char -mime_ungetc(nkf_char c, FILE *f) +mime_ungetc(nkf_char c, ARG_UNUSED FILE *f) { mime_input_buf_unshift(c); return c; @@ -3886,7 +4341,7 @@ mime_ungetc_buf(nkf_char c, FILE *f) static nkf_char mime_getc_buf(FILE *f) { - /* we don't keep eof of mime_input_buf, becase it contains ?= as + /* we don't keep eof of mime_input_buf, because it contains ?= as a terminator. It was checked in mime_integrity. */ return ((mimebuf_f)? (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++)); @@ -4012,7 +4467,7 @@ mime_begin_strict(FILE *f) static nkf_char mime_begin(FILE *f) { - nkf_char c1; + nkf_char c1 = 0; int i,k; /* In NONSTRICT mode, only =? is checked. In case of failure, we */ @@ -4068,7 +4523,7 @@ mime_begin(FILE *f) #ifdef CHECK_OPTION static void -no_putc(nkf_char c) +no_putc(ARG_UNUSED nkf_char c) { ; } @@ -4105,7 +4560,9 @@ get_guessed_code(void) if (p->score & (SCORE_DEPEND|SCORE_CP932)) input_codename = "CP932"; } else if (strcmp(input_codename, "EUC-JP") == 0) { - if (p->score & (SCORE_X0212)) + if (p->score & SCORE_X0213) + input_codename = "EUC-JIS-2004"; + else if (p->score & (SCORE_X0212)) input_codename = "EUCJP-MS"; else if (p->score & (SCORE_DEPEND|SCORE_CP932)) input_codename = "CP51932"; @@ -4131,8 +4588,13 @@ print_guessed_code(char *filename) if (guess_f == 1) { printf("%s\n", input_codename); } else { - printf("%s%s\n", + printf("%s%s%s%s\n", input_codename, + iconv != w_iconv16 && iconv != w_iconv32 ? "" : + input_endian == ENDIAN_LITTLE ? " LE" : + input_endian == ENDIAN_BIG ? " BE" : + "[BUG]", + input_bom_f ? " (BOM)" : "", input_eol == CR ? " (CR)" : input_eol == LF ? " (LF)" : input_eol == CRLF ? " (CRLF)" : @@ -4200,7 +4662,7 @@ numchar_getc(FILE *f) nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc; int i = 0, j; nkf_char buf[12]; - long c = -1; + nkf_char c = -1; buf[i] = (*g)(f); if (buf[i] == '&'){ @@ -4430,7 +4892,7 @@ mime_getc(FILE *f) } if (c1=='='&&c20 && nkf_isspace(mimeout_state.buf[i])) { @@ -4635,14 +5097,14 @@ mime_prechar(nkf_char c2, nkf_char c1) if (c2 == EOF){ if (base64_count + mimeout_state.count/3*4> 73){ (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); + oconv_newline(o_base64conv); (*o_base64conv)(0,SP); base64_count = 1; } } else { - if (base64_count + mimeout_state.count/3*4> 66) { + if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) { (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); + oconv_newline(o_base64conv); (*o_base64conv)(0,SP); base64_count = 1; mimeout_mode = -1; @@ -4653,7 +5115,7 @@ mime_prechar(nkf_char c2, nkf_char c1) mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B'; open_mime(output_mode); (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); + oconv_newline(o_base64conv); (*o_base64conv)(0,SP); base64_count = 1; mimeout_mode = -1; @@ -4751,14 +5213,14 @@ mime_putc(nkf_char c) if (base64_count > 71){ if (c!=CR && c!=LF) { (*o_mputc)('='); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); } base64_count = 0; } }else{ if (base64_count > 71){ eof_mime(); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; } if (c == EOF) { /* c==EOF */ @@ -4820,7 +5282,7 @@ mime_putc(nkf_char c) } else if (c <= SP) { close_mime(); if (base64_count > 70) { - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; } if (!nkf_isblank(c)) { @@ -4830,7 +5292,7 @@ mime_putc(nkf_char c) } else { if (base64_count > 70) { close_mime(); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); (*o_mputc)(SP); base64_count = 1; open_mime(output_mode); @@ -4840,14 +5302,17 @@ mime_putc(nkf_char c) return; } } - (*o_mputc)(c); - base64_count++; + if (c != 0x1B) { + (*o_mputc)(c); + base64_count++; + return; + } } - return; } if (mimeout_mode <= 0) { - if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { + if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || + output_mode == UTF_8)) { if (nkf_isspace(c)) { int flag = 0; if (mimeout_mode == -1) { @@ -4885,14 +5350,14 @@ mime_putc(nkf_char c) i = 0; for (; i < mimeout_state.count - len; ++i) { - if (!strncmp(mimeout_state.buf+i, str, len)) { + if (!strncmp((char *)(mimeout_state.buf+i), str, len)) { i += len - 2; break; } } if (i == 0 || i == mimeout_state.count - len) { - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; if (!nkf_isspace(mimeout_state.buf[0])){ (*o_mputc)(SP); @@ -4904,7 +5369,7 @@ mime_putc(nkf_char c) for (j = 0; j <= i; ++j) { (*o_mputc)(mimeout_state.buf[j]); } - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 1; for (; j <= mimeout_state.count; ++j) { mimeout_state.buf[j - i] = mimeout_state.buf[j]; @@ -4938,14 +5403,15 @@ mime_putc(nkf_char c) } }else{ /* mimeout_mode == 'B', 1, 2 */ - if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { + if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || + output_mode == UTF_8)) { if (lastchar == CR || lastchar == LF){ if (nkf_isblank(c)) { for (i=0;iMIMEOUT_BUF_LENGTH) { eof_mime(); - for (i=0;i DEL) { + if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) { /* in case of 8th bit is on */ if (!estab_f&&!mime_decode_mode) { /* in case of not established yet */ - /* It is still ambiguious */ + /* It is still ambiguous */ if (h_conv(f, c2, c1)==EOF) { LAST; } @@ -5447,6 +5947,12 @@ kanji_convert(FILE *f) if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) { /* CP5022x */ MORE; + }else if (input_codename && input_codename[0] == 'I' && + 0xA1 <= c1 && c1 <= 0xDF) { + /* JIS X 0201 Katakana in 8bit JIS */ + c2 = JIS_X_0201_1976_K; + c1 &= 0x7f; + SEND; } else if (c1 > DEL) { /* 8 bit code */ if (!estab_f && !iso8859_f) { @@ -5521,7 +6027,7 @@ kanji_convert(FILE *f) SKIP; } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) { if ((c1 = (*i_getc)(f)) == EOF) { - /* (*oconv)(0, ESC); don't send bogus code */ + (*oconv)(0, ESC); LAST; } else if (c1 == '&') { @@ -5597,6 +6103,7 @@ kanji_convert(FILE *f) else if (c1 == 'I') { /* JIS X 0201 Katakana */ set_input_mode(JIS_X_0201_1976_K); + shift_mode = 1; SKIP; } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') { @@ -5644,14 +6151,15 @@ kanji_convert(FILE *f) } } else { + i_ungetc(c1,f); /* lonely ESC */ (*oconv)(0, ESC); - SEND; + SKIP; } } else if (c1 == ESC && iconv == s_iconv) { /* ESC in Shift_JIS */ if ((c1 = (*i_getc)(f)) == EOF) { - /* (*oconv)(0, ESC); don't send bogus code */ + (*oconv)(0, ESC); LAST; } else if (c1 == '$') { /* J-PHONE emoji */ @@ -5683,9 +6191,10 @@ kanji_convert(FILE *f) } } else { + i_ungetc(c1,f); /* lonely ESC */ (*oconv)(0, ESC); - SEND; + SKIP; } } else if (c1 == LF || c1 == CR) { if (broken_f&4) { @@ -5738,11 +6247,52 @@ kanji_convert(FILE *f) } } break; + case -3: + /* 4 bytes UTF-8 (check combining character) */ + if ((c3 = (*i_getc)(f)) != EOF) { + if ((c4 = (*i_getc)(f)) != EOF) { + if (w_iconv_combine(c2, c1, 0, c3, c4, 0)) { + (*i_ungetc)(c4, f); + (*i_ungetc)(c3, f); + w_iconv_nocombine(c2, c1, 0); + } + } else { + (*i_ungetc)(c3, f); + w_iconv_nocombine(c2, c1, 0); + } + } else { + w_iconv_nocombine(c2, c1, 0); + } + break; case -1: /* 3 bytes EUC or UTF-8 */ if ((c3 = (*i_getc)(f)) != EOF) { code_status(c3); - (*iconv)(c2, c1, c3); + if ((*iconv)(c2, c1, c3) == -3) { + /* 6 bytes UTF-8 (check combining character) */ + nkf_char c5, c6; + if ((c4 = (*i_getc)(f)) != EOF) { + if ((c5 = (*i_getc)(f)) != EOF) { + if ((c6 = (*i_getc)(f)) != EOF) { + if (w_iconv_combine(c2, c1, c3, c4, c5, c6)) { + (*i_ungetc)(c6, f); + (*i_ungetc)(c5, f); + (*i_ungetc)(c4, f); + w_iconv_nocombine(c2, c1, c3); + } + } else { + (*i_ungetc)(c5, f); + (*i_ungetc)(c4, f); + w_iconv_nocombine(c2, c1, c3); + } + } else { + (*i_ungetc)(c4, f); + w_iconv_nocombine(c2, c1, c3); + } + } else { + w_iconv_nocombine(c2, c1, c3); + } + } } break; } @@ -5776,6 +6326,7 @@ kanji_convert(FILE *f) /* goto next_word */ } +finished: /* epilogue */ (*iconv)(EOF, 0, 0); if (!input_codename) @@ -5798,7 +6349,7 @@ kanji_convert(FILE *f) /* * int options(unsigned char *cp) - * + * * return values: * 0: success * -1: ArgumentError @@ -5827,7 +6378,7 @@ options(unsigned char *cp) option_mode = 1; return 0; } - for (i=0;iX0208 conversion */ @@ -6347,7 +6899,7 @@ options(unsigned char *cp) continue; #endif case SP: - /* module muliple options in a string are allowed for Perl moudle */ + /* module multiple options in a string are allowed for Perl module */ while(*cp && *cp++!='-'); continue; default: