X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=nkf.c;h=6877afe2374925c01f6ea480989933506c006daa;hb=71012057801fe0cdcf0ab9a136c7259e5409ca0e;hp=d587aac3f94db1ad1b1a7b05639a09c8618a96ae;hpb=c12280757bfb275d6f9e6b0bf6293a28b060e77b;p=nkf%2Fnkf.git diff --git a/nkf.c b/nkf.c index d587aac..6877afe 100644 --- a/nkf.c +++ b/nkf.c @@ -1,30 +1,30 @@ /* * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA). - * Copyright (c) 1996-2009, The nkf Project. + * Copyright (c) 1996-2010, The nkf Project. * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * + * 3. This notice may not be removed or altered from any source distribution. */ -#define NKF_VERSION "2.0.8" -#define NKF_RELEASE_DATE "2009-01-05" +#define NKF_VERSION "2.1.1" +#define NKF_RELEASE_DATE "2010-08-08" #define COPY_RIGHT \ "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \ - "Copyright (C) 1996-2009, The nkf Project." + "Copyright (C) 1996-2010, The nkf Project." #include "config.h" #include "nkf.h" @@ -210,6 +210,8 @@ struct { } encoding_name_to_id_table[] = { {"US-ASCII", ASCII}, {"ASCII", ASCII}, + {"646", ASCII}, + {"ROMAN8", ASCII}, {"ISO-2022-JP", ISO_2022_JP}, {"ISO2022JP-CP932", CP50220}, {"CP50220", CP50220}, @@ -221,6 +223,8 @@ struct { {"ISO-2022-JP-2004", ISO_2022_JP_2004}, {"SHIFT_JIS", SHIFT_JIS}, {"SJIS", SHIFT_JIS}, + {"MS_Kanji", SHIFT_JIS}, + {"PCK", SHIFT_JIS}, {"WINDOWS-31J", WINDOWS_31J}, {"CSWINDOWS31J", WINDOWS_31J}, {"CP932", WINDOWS_31J}, @@ -295,7 +299,7 @@ struct { && (c != '(') && (c != ')') && (c != '.') && (c != 0x22))) #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END) -#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c < (0xE0&0x7F)) +#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c <= 0x5F) #define HOLD_SIZE 1024 #if defined(INT_IS_SHORT) @@ -379,6 +383,8 @@ static unsigned char stdibuf[IOBUF_SIZE]; static unsigned char stdobuf[IOBUF_SIZE]; #endif +#define NKF_UNSPECIFIED (-TRUE) + /* flags */ static int unbuf_f = FALSE; static int estab_f = FALSE; @@ -393,7 +399,7 @@ static int mimebuf_f = FALSE; /* MIME buffered input */ static int broken_f = FALSE; /* convert ESC-less broken JIS */ static int iso8859_f = FALSE; /* ISO8859 through */ static int mimeout_f = FALSE; /* base64 mode */ -static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */ +static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */ static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */ #ifdef UNICODE_NORMALIZATION @@ -468,8 +474,10 @@ struct input_code input_code_list[] = { {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0}, #ifdef UTF8_INPUT_ENABLE {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0}, + {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0}, + {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0}, #endif - {0} + {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0} }; static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */ @@ -501,7 +509,7 @@ static nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0) { fprintf(stderr,"nkf internal module connection failure.\n"); - exit(1); + exit(EXIT_FAILURE); return 0; /* LINT */ } @@ -649,10 +657,6 @@ static nkf_char prev_cr = 0; /* CR or 0 */ static int end_check; #endif /*Easy Win */ -#define STD_GC_BUFSIZE (256) -nkf_char std_gc_buf[STD_GC_BUFSIZE]; -nkf_char std_gc_ndx; - static void * nkf_xmalloc(size_t size) { @@ -806,30 +810,32 @@ nkf_default_encoding() typedef struct { long capa; long len; - unsigned char *ptr; + nkf_char *ptr; } nkf_buf_t; static nkf_buf_t * nkf_buf_new(int length) { nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t)); - buf->ptr = nkf_xmalloc(length); + buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length); buf->capa = length; buf->len = 0; return buf; } +#if 0 static void nkf_buf_dispose(nkf_buf_t *buf) { nkf_xfree(buf->ptr); nkf_xfree(buf); } +#endif #define nkf_buf_length(buf) ((buf)->len) #define nkf_buf_empty_p(buf) ((buf)->len == 0) -static unsigned char +static nkf_char nkf_buf_at(nkf_buf_t *buf, int index) { assert(index <= buf->len); @@ -839,17 +845,19 @@ nkf_buf_at(nkf_buf_t *buf, int index) static void nkf_buf_clear(nkf_buf_t *buf) { - buf->ptr = 0; + buf->len = 0; } static void -nkf_buf_push(nkf_buf_t *buf, unsigned char c) +nkf_buf_push(nkf_buf_t *buf, nkf_char c) { - assert(buf->capa > buf->len); + if (buf->capa <= buf->len) { + exit(EXIT_FAILURE); + } buf->ptr[buf->len++] = c; } -static unsigned char +static nkf_char nkf_buf_pop(nkf_buf_t *buf) { assert(!nkf_buf_empty_p(buf)); @@ -872,79 +880,61 @@ static void usage(void) { fprintf(HELP_OUTPUT, - "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n" - "Flags:\n" - "b,u Output is buffered (DEFAULT),Output is unbuffered\n" - "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n" + "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n" #ifdef UTF8_OUTPUT_ENABLE - " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n" + " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" + " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n" +#else #endif - "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n" #ifdef UTF8_INPUT_ENABLE - " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n" + " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" + " UTF option is -W[8,[16,32][B,L]]\n" +#else + " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" #endif - "t no conversion\n" ); fprintf(HELP_OUTPUT, - "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n" - "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n" - "r {de/en}crypt ROT13/47\n" - "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n" - "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n" - "M[BQ] MIME encode [B:base64 Q:quoted]\n" - "l ISO8859-1 (Latin-1) support\n" - "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n" + " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n" + " M[BQ] MIME encode [B:base64 Q:quoted]\n" + " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n" ); fprintf(HELP_OUTPUT, - "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n" - " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n" - " 4: JISX0208 Katakana to JISX0201 Katakana\n" - "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n" - "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n" + " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n" + " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n" + " 4: JISX0208 Katakana to JISX0201 Katakana\n" + " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n" ); fprintf(HELP_OUTPUT, -#ifdef MSDOS - "T Text mode output\n" -#endif - "O Output to File (DEFAULT 'nkf.out')\n" - "I Convert non ISO-2022-JP charactor to GETA\n" - "d,c Convert line breaks -d: LF -c: CRLF\n" - "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n" - "v, V Show this usage. V: show configuration\n" - "\n"); + " O Output to File (DEFAULT 'nkf.out')\n" + " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n" + ); fprintf(HELP_OUTPUT, - "Long name options\n" - " --ic= --oc=\n" - " Specify the input or output codeset\n" - " --fj --unix --mac --windows\n" - " --jis --euc --sjis --utf8 --utf16 --mime --base64\n" - " Convert for the system or code\n" - " --hiragana --katakana --katakana-hiragana\n" - " To Hiragana/Katakana Conversion\n" - " --prefix= Insert escape before troublesome characters of Shift_JIS\n" + " --ic= Specify the input encoding\n" + " --oc= Specify the output encoding\n" + " --hiragana --katakana Hiragana/Katakana Conversion\n" + " --katakana-hiragana Converts each other\n" ); fprintf(HELP_OUTPUT, #ifdef INPUT_OPTION - " --cap-input, --url-input Convert hex after ':' or '%%'\n" + " --{cap, url}-input Convert hex after ':' or '%%'\n" #endif #ifdef NUMCHAR_OPTION - " --numchar-input Convert Unicode Character Reference\n" + " --numchar-input Convert Unicode Character Reference\n" #endif #ifdef UTF8_INPUT_ENABLE " --fb-{skip, html, xml, perl, java, subchar}\n" - " Specify how nkf handles unassigned characters\n" + " Specify unassigned character's replacement\n" #endif ); fprintf(HELP_OUTPUT, #ifdef OVERWRITE - " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n" - " Overwrite original listed files by filtered result\n" - " --overwrite preserves timestamp of original files\n" + " --in-place[=SUF] Overwrite original files\n" + " --overwrite[=SUF] Preserve timestamp of original files\n" #endif - " -g --guess Guess the input code\n" - " --help --version Show this help/the version\n" - " For more information, see also man nkf\n" - "\n"); + " -g --guess Guess the input code\n" + " -v --version Print the version\n" + " --help/-V Print this help / configuration\n" + ); version(); } @@ -1044,7 +1034,7 @@ nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c) int shift = 20; c &= VALUE_MASK; while(shift >= 0){ - if(c >= 1<= NKF_INT32_C(1)<= 0){ (*f)(0, bin2hex(c>>shift)); shift -= 4; @@ -1142,7 +1132,7 @@ static const struct { {"euc","e"}, {"euc-input","E"}, {"fj","jm"}, - {"help","v"}, + {"help",""}, {"jis","j"}, {"jis-input","J"}, {"mac","sLm"}, @@ -1152,7 +1142,7 @@ static const struct { {"sjis","s"}, {"sjis-input","S"}, {"unix","eLu"}, - {"version","V"}, + {"version","v"}, {"windows","sLw"}, {"hiragana","h1"}, {"katakana","h2"}, @@ -1219,9 +1209,10 @@ set_input_encoding(nkf_encoding *enc) case ISO_8859_1: iso8859_f = TRUE; break; - case CP50220: case CP50221: case CP50222: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ + case CP50220: #ifdef SHIFTJIS_CP932 cp51932_f = TRUE; #endif @@ -1243,6 +1234,7 @@ set_input_encoding(nkf_encoding *enc) case SHIFT_JIS: break; case WINDOWS_31J: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = TRUE; #endif @@ -1264,6 +1256,7 @@ set_input_encoding(nkf_encoding *enc) case EUCJP_NKF: break; case CP51932: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = TRUE; #endif @@ -1272,6 +1265,7 @@ set_input_encoding(nkf_encoding *enc) #endif break; case EUCJP_MS: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = FALSE; #endif @@ -1280,6 +1274,7 @@ set_input_encoding(nkf_encoding *enc) #endif break; case EUCJP_ASCII: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = FALSE; #endif @@ -1334,7 +1329,6 @@ set_output_encoding(nkf_encoding *enc) { switch (nkf_enc_to_index(enc)) { case CP50220: - x0201_f = TRUE; #ifdef SHIFTJIS_CP932 if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif @@ -1343,6 +1337,7 @@ set_output_encoding(nkf_encoding *enc) #endif break; case CP50221: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif @@ -1350,6 +1345,11 @@ set_output_encoding(nkf_encoding *enc) ms_ucs_map_f = UCS_MAP_CP932; #endif break; + case ISO_2022_JP: +#ifdef SHIFTJIS_CP932 + if (cp932inv_f == TRUE) cp932inv_f = FALSE; +#endif + break; case ISO_2022_JP_1: x0212_f = TRUE; #ifdef SHIFTJIS_CP932 @@ -1366,6 +1366,7 @@ set_output_encoding(nkf_encoding *enc) case SHIFT_JIS: break; case WINDOWS_31J: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef UTF8_OUTPUT_ENABLE ms_ucs_map_f = UCS_MAP_CP932; #endif @@ -1394,6 +1395,7 @@ set_output_encoding(nkf_encoding *enc) #endif break; case CP51932: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif @@ -1402,12 +1404,14 @@ set_output_encoding(nkf_encoding *enc) #endif break; case EUCJP_MS: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ x0212_f = TRUE; #ifdef UTF8_OUTPUT_ENABLE ms_ucs_map_f = UCS_MAP_MS; #endif break; case EUCJP_ASCII: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ x0212_f = TRUE; #ifdef UTF8_OUTPUT_ENABLE ms_ucs_map_f = UCS_MAP_ASCII; @@ -1444,6 +1448,7 @@ set_output_encoding(nkf_encoding *enc) output_endian = ENDIAN_LITTLE; output_bom_f = TRUE; break; + case UTF_32: case UTF_32BE_BOM: output_bom_f = TRUE; break; @@ -1670,7 +1675,7 @@ nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_ *p3 = 0x80 | ( val & 0x3f); *p4 = 0; } else if (nkf_char_unicode_value_p(val)) { - *p1 = 0xe0 | (val >> 16); + *p1 = 0xf0 | (val >> 18); *p2 = 0x80 | ((val >> 12) & 0x3f); *p3 = 0x80 | ((val >> 6) & 0x3f); *p4 = 0x80 | ( val & 0x3f); @@ -2197,8 +2202,8 @@ unicode_iconv(nkf_char wc) return 0; } -#define NKF_ICONV_NEED_ONE_MORE_BYTE -1 -#define NKF_ICONV_NEED_TWO_MORE_BYTES -2 +#define NKF_ICONV_NEED_ONE_MORE_BYTE (size_t)-1 +#define NKF_ICONV_NEED_TWO_MORE_BYTES (size_t)-2 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00)) static size_t nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) @@ -2234,13 +2239,15 @@ nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0) { - return 0; + (*oconv)(c2, c1); + return 16; /* different from w_iconv32 */ } static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0) { - return 0; + (*oconv)(c2, c1); + return 32; /* different from w_iconv16 */ } static size_t @@ -2997,12 +3004,42 @@ code_status(nkf_char c) } } +typedef struct { + nkf_buf_t *std_gc_buf; + nkf_char broken_state; + nkf_buf_t *broken_buf; + nkf_char mimeout_state; + nkf_buf_t *nfc_buf; +} nkf_state_t; + +static nkf_state_t *nkf_state = NULL; + +#define STD_GC_BUFSIZE (256) + +static void +nkf_state_init(void) +{ + if (nkf_state) { + nkf_buf_clear(nkf_state->std_gc_buf); + nkf_buf_clear(nkf_state->broken_buf); + nkf_buf_clear(nkf_state->nfc_buf); + } + else { + nkf_state = nkf_xmalloc(sizeof(nkf_state_t)); + nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE); + nkf_state->broken_buf = nkf_buf_new(3); + nkf_state->nfc_buf = nkf_buf_new(9); + } + nkf_state->broken_state = 0; + nkf_state->mimeout_state = 0; +} + #ifndef WIN32DLL static nkf_char std_getc(FILE *f) { - if (std_gc_ndx){ - return std_gc_buf[--std_gc_ndx]; + if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){ + return nkf_buf_pop(nkf_state->std_gc_buf); } return getc(f); } @@ -3011,10 +3048,7 @@ std_getc(FILE *f) static nkf_char std_ungetc(nkf_char c, FILE *f) { - if (std_gc_ndx == STD_GC_BUFSIZE){ - return EOF; - } - std_gc_buf[std_gc_ndx++] = c; + nkf_buf_push(nkf_state->std_gc_buf, c); return c; } @@ -3027,23 +3061,23 @@ std_putc(nkf_char c) } #endif /*WIN32DLL*/ -static unsigned char hold_buf[HOLD_SIZE*2]; +static nkf_char hold_buf[HOLD_SIZE*2]; static int hold_count = 0; static nkf_char push_hold_buf(nkf_char c2) { if (hold_count >= HOLD_SIZE*2) return (EOF); - hold_buf[hold_count++] = (unsigned char)c2; + hold_buf[hold_count++] = c2; return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); } static int -h_conv(FILE *f, int c1, int c2) +h_conv(FILE *f, nkf_char c1, nkf_char c2) { - int ret, c4, c3; + int ret; int hold_index; - + nkf_char c3, c4; /** it must NOT be in the kanji shifte sequence */ /** it must NOT be written in JIS7 */ @@ -3093,7 +3127,11 @@ h_conv(FILE *f, int c1, int c2) hold_index = 0; while (hold_index < hold_count){ c1 = hold_buf[hold_index++]; - if (c1 <= DEL){ + if (nkf_char_unicode_p(c1)) { + (*oconv)(0, c1); + continue; + } + else if (c1 <= DEL){ (*iconv)(0, c1, 0); continue; }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){ @@ -3119,18 +3157,16 @@ h_conv(FILE *f, int c1, int c2) } else if ((c3 = (*i_getc)(f)) == EOF) { ret = EOF; break; - } else { - code_status(c3); - if (hold_index < hold_count){ - c4 = hold_buf[hold_index++]; - } else if ((c4 = (*i_getc)(f)) == EOF) { - c3 = ret = EOF; - break; - } else { - code_status(c4); - (*iconv)(c1, c2, (c3<<8)|c4); - } } + code_status(c3); + if (hold_index < hold_count){ + c4 = hold_buf[hold_index++]; + } else if ((c4 = (*i_getc)(f)) == EOF) { + c3 = ret = EOF; + break; + } + code_status(c4); + (*iconv)(c1, c2, (c3<<8)|c4); break; case -1: /* 3 bytes EUC or UTF-8 */ @@ -3262,65 +3298,41 @@ check_bom(FILE *f) } } -static struct { - int count; - nkf_char status; - nkf_char buf[3]; -} broken_state; - -static void -init_broken_state(void) -{ - memset(&broken_state, 0, sizeof(broken_state)); -} - -static void -push_broken_buf(c) -{ - broken_state.buf[broken_state.count++] = c; -} - -static nkf_char -pop_broken_buf(void) -{ - return broken_state.buf[--broken_state.count]; -} - static nkf_char broken_getc(FILE *f) { nkf_char c, c1; - if (broken_state.count > 0) { - return pop_broken_buf(); + if (!nkf_buf_empty_p(nkf_state->broken_buf)) { + return nkf_buf_pop(nkf_state->broken_buf); } c = (*i_bgetc)(f); - if (c=='$' && broken_state.status != ESC + if (c=='$' && nkf_state->broken_state != ESC && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) { c1= (*i_bgetc)(f); - broken_state.status = 0; + nkf_state->broken_state = 0; if (c1=='@'|| c1=='B') { - push_broken_buf(c1); - push_broken_buf(c); + nkf_buf_push(nkf_state->broken_buf, c1); + nkf_buf_push(nkf_state->broken_buf, c); return ESC; } else { (*i_bungetc)(c1,f); return c; } - } else if (c=='(' && broken_state.status != ESC + } else if (c=='(' && nkf_state->broken_state != ESC && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) { c1= (*i_bgetc)(f); - broken_state.status = 0; + nkf_state->broken_state = 0; if (c1=='J'|| c1=='B') { - push_broken_buf(c1); - push_broken_buf(c); + nkf_buf_push(nkf_state->broken_buf, c1); + nkf_buf_push(nkf_state->broken_buf, c); return ESC; } else { (*i_bungetc)(c1,f); return c; } } else { - broken_state.status = c; + nkf_state->broken_state = c; return c; } } @@ -3328,8 +3340,8 @@ broken_getc(FILE *f) static nkf_char broken_ungetc(nkf_char c, FILE *f) { - if (broken_state.count < 2) - push_broken_buf(c); + if (nkf_buf_length(nkf_state->broken_buf) < 2) + nkf_buf_push(nkf_state->broken_buf, c); return c; } @@ -3354,6 +3366,40 @@ eol_conv(nkf_char c2, nkf_char c1) else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1); } +static void +put_newline(void (*func)(nkf_char)) +{ + switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { + case CRLF: + (*func)(0x0D); + (*func)(0x0A); + break; + case CR: + (*func)(0x0D); + break; + case LF: + (*func)(0x0A); + break; + } +} + +static void +oconv_newline(void (*func)(nkf_char, nkf_char)) +{ + switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { + case CRLF: + (*func)(0, 0x0D); + (*func)(0, 0x0A); + break; + case CR: + (*func)(0, 0x0D); + break; + case LF: + (*func)(0, 0x0A); + break; + } +} + /* Return value of fold_conv() @@ -3430,9 +3476,7 @@ fold_conv(nkf_char c2, nkf_char c1) f_prev = LF; f_line = 0; fold_state = LF; /* output newline and clear */ - } else if ( (c2==0 && c1==SP)|| - (c2==0 && c1==TAB)|| - (c2=='!'&& c1=='!')) { + } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) { /* X0208 kankaku or ascii space */ if (f_prev == SP) { fold_state = 0; /* remove duplicate spaces */ @@ -3528,13 +3572,13 @@ fold_conv(nkf_char c2, nkf_char c1) /* terminator process */ switch(fold_state) { case LF: - OCONV_NEWLINE((*o_fconv)); + oconv_newline(o_fconv); (*o_fconv)(c2,c1); break; case 0: return; case CR: - OCONV_NEWLINE((*o_fconv)); + oconv_newline(o_fconv); break; case TAB: case SP: @@ -3821,6 +3865,7 @@ static const unsigned char *mime_pattern[] = { (const unsigned char *)"\075?ISO-8859-1?Q?", (const unsigned char *)"\075?ISO-8859-1?B?", (const unsigned char *)"\075?ISO-2022-JP?B?", + (const unsigned char *)"\075?ISO-2022-JP?B?", (const unsigned char *)"\075?ISO-2022-JP?Q?", #if defined(UTF8_INPUT_ENABLE) (const unsigned char *)"\075?UTF-8?B?", @@ -3841,7 +3886,7 @@ nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = { }; static const nkf_char mime_encode[] = { - EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, + EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K, #if defined(UTF8_INPUT_ENABLE) UTF_8, UTF_8, #endif @@ -3850,7 +3895,7 @@ static const nkf_char mime_encode[] = { }; static const nkf_char mime_encode_method[] = { - 'B', 'B','Q', 'B', 'B', 'Q', + 'B', 'B','Q', 'B', 'B', 'B', 'Q', #if defined(UTF8_INPUT_ENABLE) 'B', 'Q', #endif @@ -4275,14 +4320,14 @@ nfc_getc(FILE *f) { nkf_char (*g)(FILE *f) = i_nfc_getc; nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc; - nkf_buf_t *buf = nkf_buf_new(9); + nkf_buf_t *buf = nkf_state->nfc_buf; const unsigned char *array; int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1; nkf_char c = (*g)(f); if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c; - nkf_buf_push(buf, (unsigned char)c); + nkf_buf_push(buf, c); do { while (lower <= upper) { int mid = (lower+upper) / 2; @@ -4318,7 +4363,6 @@ nfc_getc(FILE *f) while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f); c = nkf_buf_pop(buf); - nkf_buf_dispose(buf); return c; } @@ -4400,7 +4444,7 @@ mime_getc(FILE *f) case LF: case CR: if (c1==LF) { - if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) { + if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { i_ungetc(SP,f); continue; } else { @@ -4409,7 +4453,7 @@ mime_getc(FILE *f) c1 = LF; } else { if ((c1=(*i_getc)(f))!=EOF && c1 == LF) { - if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) { + if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { i_ungetc(SP,f); continue; } else { @@ -4445,7 +4489,7 @@ mime_getc(FILE *f) } if (c1=='='&&c20 - && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB - || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) { + if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) { i++; } } for (;i 73){ (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); + oconv_newline(o_base64conv); (*o_base64conv)(0,SP); base64_count = 1; } } else { - if (base64_count + mimeout_state.count/3*4> 66) { + if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) { (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); + oconv_newline(o_base64conv); (*o_base64conv)(0,SP); base64_count = 1; mimeout_mode = -1; @@ -4672,7 +4712,7 @@ mime_prechar(nkf_char c2, nkf_char c1) mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B'; open_mime(output_mode); (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); + oconv_newline(o_base64conv); (*o_base64conv)(0,SP); base64_count = 1; mimeout_mode = -1; @@ -4697,13 +4737,13 @@ eof_mime(void) case 'B': break; case 2: - (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]); + (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]); (*o_mputc)('='); (*o_mputc)('='); base64_count += 3; break; case 1: - (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]); + (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]); (*o_mputc)('='); base64_count += 2; break; @@ -4735,19 +4775,19 @@ mimeout_addchar(nkf_char c) } break; case 'B': - mimeout_state.state=c; + nkf_state->mimeout_state=c; (*o_mputc)(basis_64[c>>2]); mimeout_mode=2; base64_count ++; break; case 2: - (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]); - mimeout_state.state=c; + (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]); + nkf_state->mimeout_state=c; mimeout_mode=1; base64_count ++; break; case 1: - (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]); + (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]); (*o_mputc)(basis_64[c & 0x3F]); mimeout_mode='B'; base64_count += 2; @@ -4770,14 +4810,14 @@ mime_putc(nkf_char c) if (base64_count > 71){ if (c!=CR && c!=LF) { (*o_mputc)('='); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); } base64_count = 0; } }else{ if (base64_count > 71){ eof_mime(); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; } if (c == EOF) { /* c==EOF */ @@ -4839,7 +4879,7 @@ mime_putc(nkf_char c) } else if (c <= SP) { close_mime(); if (base64_count > 70) { - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; } if (!nkf_isblank(c)) { @@ -4849,7 +4889,7 @@ mime_putc(nkf_char c) } else { if (base64_count > 70) { close_mime(); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); (*o_mputc)(SP); base64_count = 1; open_mime(output_mode); @@ -4859,14 +4899,17 @@ mime_putc(nkf_char c) return; } } - (*o_mputc)(c); - base64_count++; + if (c != 0x1B) { + (*o_mputc)(c); + base64_count++; + return; + } } - return; } if (mimeout_mode <= 0) { - if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { + if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || + output_mode == UTF_8)) { if (nkf_isspace(c)) { int flag = 0; if (mimeout_mode == -1) { @@ -4904,14 +4947,14 @@ mime_putc(nkf_char c) i = 0; for (; i < mimeout_state.count - len; ++i) { - if (!strncmp(mimeout_state.buf+i, str, len)) { + if (!strncmp((char *)(mimeout_state.buf+i), str, len)) { i += len - 2; break; } } if (i == 0 || i == mimeout_state.count - len) { - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; if (!nkf_isspace(mimeout_state.buf[0])){ (*o_mputc)(SP); @@ -4923,7 +4966,7 @@ mime_putc(nkf_char c) for (j = 0; j <= i; ++j) { (*o_mputc)(mimeout_state.buf[j]); } - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 1; for (; j <= mimeout_state.count; ++j) { mimeout_state.buf[j - i] = mimeout_state.buf[j]; @@ -4957,14 +5000,15 @@ mime_putc(nkf_char c) } }else{ /* mimeout_mode == 'B', 1, 2 */ - if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { + if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || + output_mode == UTF_8)) { if (lastchar == CR || lastchar == LF){ if (nkf_isblank(c)) { for (i=0;i DEL) { /* 8 bit code */ if (!estab_f && !iso8859_f) { @@ -5540,7 +5596,7 @@ kanji_convert(FILE *f) SKIP; } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) { if ((c1 = (*i_getc)(f)) == EOF) { - /* (*oconv)(0, ESC); don't send bogus code */ + (*oconv)(0, ESC); LAST; } else if (c1 == '&') { @@ -5670,7 +5726,7 @@ kanji_convert(FILE *f) } else if (c1 == ESC && iconv == s_iconv) { /* ESC in Shift_JIS */ if ((c1 = (*i_getc)(f)) == EOF) { - /* (*oconv)(0, ESC); don't send bogus code */ + (*oconv)(0, ESC); LAST; } else if (c1 == '$') { /* J-PHONE emoji */ @@ -5795,6 +5851,7 @@ kanji_convert(FILE *f) /* goto next_word */ } +finished: /* epilogue */ (*iconv)(EOF, 0, 0); if (!input_codename) @@ -5846,7 +5903,7 @@ options(unsigned char *cp) option_mode = 1; return 0; } - for (i=0;i 0) && **argv == '-'; argc--, argv++) { cp = (unsigned char *)*argv; options(cp);