X-Git-Url: http://git.sourceforge.jp/view?p=nkf%2Fnkf.git;a=blobdiff_plain;f=nkf.c;h=a8bfc9a58b73d7dfe25dffd332ecc01c13b63da2;hp=fba16bf004a61337eb88cabf9914891029e3c680;hb=5d213c263b33494a0af1e45cf3414151f0daaf2e;hpb=4b48f3bb3f13d60c87f097561e6dbc68b118e880 diff --git a/nkf.c b/nkf.c index fba16bf..a8bfc9a 100644 --- a/nkf.c +++ b/nkf.c @@ -1,30 +1,30 @@ /* * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA). - * Copyright (c) 1996-2009, The nkf Project. + * Copyright (c) 1996-2010, The nkf Project. * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * + * 3. This notice may not be removed or altered from any source distribution. */ -#define NKF_VERSION "2.0.8" -#define NKF_RELEASE_DATE "2009-01-05" +#define NKF_VERSION "2.1.1" +#define NKF_RELEASE_DATE "2010-04-14" #define COPY_RIGHT \ "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \ - "Copyright (C) 1996-2009, The nkf Project." + "Copyright (C) 1996-2010, The nkf Project." #include "config.h" #include "nkf.h" @@ -210,6 +210,8 @@ struct { } encoding_name_to_id_table[] = { {"US-ASCII", ASCII}, {"ASCII", ASCII}, + {"646", ASCII}, + {"ROMAN8", ASCII}, {"ISO-2022-JP", ISO_2022_JP}, {"ISO2022JP-CP932", CP50220}, {"CP50220", CP50220}, @@ -221,6 +223,7 @@ struct { {"ISO-2022-JP-2004", ISO_2022_JP_2004}, {"SHIFT_JIS", SHIFT_JIS}, {"SJIS", SHIFT_JIS}, + {"PCK", SHIFT_JIS}, {"WINDOWS-31J", WINDOWS_31J}, {"CSWINDOWS31J", WINDOWS_31J}, {"CP932", WINDOWS_31J}, @@ -295,7 +298,7 @@ struct { && (c != '(') && (c != ')') && (c != '.') && (c != 0x22))) #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END) -#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c < (0xE0&0x7F)) +#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c <= 0x5F) #define HOLD_SIZE 1024 #if defined(INT_IS_SHORT) @@ -379,6 +382,8 @@ static unsigned char stdibuf[IOBUF_SIZE]; static unsigned char stdobuf[IOBUF_SIZE]; #endif +#define NKF_UNSPECIFIED (-TRUE) + /* flags */ static int unbuf_f = FALSE; static int estab_f = FALSE; @@ -393,7 +398,7 @@ static int mimebuf_f = FALSE; /* MIME buffered input */ static int broken_f = FALSE; /* convert ESC-less broken JIS */ static int iso8859_f = FALSE; /* ISO8859 through */ static int mimeout_f = FALSE; /* base64 mode */ -static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */ +static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */ static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */ #ifdef UNICODE_NORMALIZATION @@ -468,6 +473,8 @@ struct input_code input_code_list[] = { {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0}, #ifdef UTF8_INPUT_ENABLE {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0}, + {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0}, + {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0}, #endif {0} }; @@ -501,7 +508,7 @@ static nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0) { fprintf(stderr,"nkf internal module connection failure.\n"); - exit(1); + exit(EXIT_FAILURE); return 0; /* LINT */ } @@ -802,30 +809,32 @@ nkf_default_encoding() typedef struct { long capa; long len; - unsigned char *ptr; + nkf_char *ptr; } nkf_buf_t; static nkf_buf_t * nkf_buf_new(int length) { nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t)); - buf->ptr = nkf_xmalloc(length); + buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length); buf->capa = length; buf->len = 0; return buf; } +#if 0 static void nkf_buf_dispose(nkf_buf_t *buf) { nkf_xfree(buf->ptr); nkf_xfree(buf); } +#endif #define nkf_buf_length(buf) ((buf)->len) #define nkf_buf_empty_p(buf) ((buf)->len == 0) -static unsigned char +static nkf_char nkf_buf_at(nkf_buf_t *buf, int index) { assert(index <= buf->len); @@ -839,7 +848,7 @@ nkf_buf_clear(nkf_buf_t *buf) } static void -nkf_buf_push(nkf_buf_t *buf, unsigned char c) +nkf_buf_push(nkf_buf_t *buf, nkf_char c) { if (buf->capa <= buf->len) { exit(EXIT_FAILURE); @@ -847,7 +856,7 @@ nkf_buf_push(nkf_buf_t *buf, unsigned char c) buf->ptr[buf->len++] = c; } -static unsigned char +static nkf_char nkf_buf_pop(nkf_buf_t *buf) { assert(!nkf_buf_empty_p(buf)); @@ -870,79 +879,61 @@ static void usage(void) { fprintf(HELP_OUTPUT, - "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n" - "Flags:\n" - "b,u Output is buffered (DEFAULT),Output is unbuffered\n" - "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n" + "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n" #ifdef UTF8_OUTPUT_ENABLE - " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n" + " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" + " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n" +#else #endif - "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n" #ifdef UTF8_INPUT_ENABLE - " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n" + " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" + " UTF option is -W[8,[16,32][B,L]]\n" +#else + " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" #endif - "t no conversion\n" ); fprintf(HELP_OUTPUT, - "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n" - "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n" - "r {de/en}crypt ROT13/47\n" - "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n" - "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n" - "M[BQ] MIME encode [B:base64 Q:quoted]\n" - "l ISO8859-1 (Latin-1) support\n" - "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n" + " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n" + " M[BQ] MIME encode [B:base64 Q:quoted]\n" + " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n" ); fprintf(HELP_OUTPUT, - "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n" - " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n" - " 4: JISX0208 Katakana to JISX0201 Katakana\n" - "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n" - "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n" + " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n" + " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n" + " 4: JISX0208 Katakana to JISX0201 Katakana\n" + " X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n" ); fprintf(HELP_OUTPUT, -#ifdef MSDOS - "T Text mode output\n" -#endif - "O Output to File (DEFAULT 'nkf.out')\n" - "I Convert non ISO-2022-JP charactor to GETA\n" - "d,c Convert line breaks -d: LF -c: CRLF\n" - "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n" - "v, V Show this usage. V: show configuration\n" - "\n"); + " O Output to File (DEFAULT 'nkf.out')\n" + " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n" + ); fprintf(HELP_OUTPUT, - "Long name options\n" - " --ic= --oc=\n" - " Specify the input or output codeset\n" - " --fj --unix --mac --windows\n" - " --jis --euc --sjis --utf8 --utf16 --mime --base64\n" - " Convert for the system or code\n" - " --hiragana --katakana --katakana-hiragana\n" - " To Hiragana/Katakana Conversion\n" - " --prefix= Insert escape before troublesome characters of Shift_JIS\n" + " --ic= Specify the input encoding\n" + " --oc= Specify the output encoding\n" + " --hiragana --katakana Hiragana/Katakana Conversion\n" + " --katakana-hiragana Converts each other\n" ); fprintf(HELP_OUTPUT, #ifdef INPUT_OPTION - " --cap-input, --url-input Convert hex after ':' or '%%'\n" + " --{cap, url}-input Convert hex after ':' or '%%'\n" #endif #ifdef NUMCHAR_OPTION - " --numchar-input Convert Unicode Character Reference\n" + " --numchar-input Convert Unicode Character Reference\n" #endif #ifdef UTF8_INPUT_ENABLE " --fb-{skip, html, xml, perl, java, subchar}\n" - " Specify how nkf handles unassigned characters\n" + " Specify unassigned character's replacement\n" #endif ); fprintf(HELP_OUTPUT, #ifdef OVERWRITE - " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n" - " Overwrite original listed files by filtered result\n" - " --overwrite preserves timestamp of original files\n" + " --in-place[=SUF] Overwrite original files\n" + " --overwrite[=SUF] Preserve timestamp of original files\n" #endif - " -g --guess Guess the input code\n" - " --help --version Show this help/the version\n" - " For more information, see also man nkf\n" - "\n"); + " -g --guess Guess the input code\n" + " -v --version Print the version\n" + " --help/-V Print this help / configuration\n" + ); version(); } @@ -1042,7 +1033,7 @@ nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c) int shift = 20; c &= VALUE_MASK; while(shift >= 0){ - if(c >= 1<= NKF_INT32_C(1)<= 0){ (*f)(0, bin2hex(c>>shift)); shift -= 4; @@ -1140,7 +1131,7 @@ static const struct { {"euc","e"}, {"euc-input","E"}, {"fj","jm"}, - {"help","v"}, + {"help",""}, {"jis","j"}, {"jis-input","J"}, {"mac","sLm"}, @@ -1150,7 +1141,7 @@ static const struct { {"sjis","s"}, {"sjis-input","S"}, {"unix","eLu"}, - {"version","V"}, + {"version","v"}, {"windows","sLw"}, {"hiragana","h1"}, {"katakana","h2"}, @@ -1220,6 +1211,7 @@ set_input_encoding(nkf_encoding *enc) case CP50220: case CP50221: case CP50222: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = TRUE; #endif @@ -1241,6 +1233,7 @@ set_input_encoding(nkf_encoding *enc) case SHIFT_JIS: break; case WINDOWS_31J: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = TRUE; #endif @@ -1262,6 +1255,7 @@ set_input_encoding(nkf_encoding *enc) case EUCJP_NKF: break; case CP51932: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 cp51932_f = TRUE; #endif @@ -1332,7 +1326,7 @@ set_output_encoding(nkf_encoding *enc) { switch (nkf_enc_to_index(enc)) { case CP50220: - x0201_f = TRUE; + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif @@ -1341,6 +1335,7 @@ set_output_encoding(nkf_encoding *enc) #endif break; case CP50221: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif @@ -1348,6 +1343,11 @@ set_output_encoding(nkf_encoding *enc) ms_ucs_map_f = UCS_MAP_CP932; #endif break; + case ISO_2022_JP: +#ifdef SHIFTJIS_CP932 + if (cp932inv_f == TRUE) cp932inv_f = FALSE; +#endif + break; case ISO_2022_JP_1: x0212_f = TRUE; #ifdef SHIFTJIS_CP932 @@ -1364,6 +1364,7 @@ set_output_encoding(nkf_encoding *enc) case SHIFT_JIS: break; case WINDOWS_31J: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef UTF8_OUTPUT_ENABLE ms_ucs_map_f = UCS_MAP_CP932; #endif @@ -1392,6 +1393,7 @@ set_output_encoding(nkf_encoding *enc) #endif break; case CP51932: + if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */ #ifdef SHIFTJIS_CP932 if (cp932inv_f == TRUE) cp932inv_f = FALSE; #endif @@ -1442,6 +1444,7 @@ set_output_encoding(nkf_encoding *enc) output_endian = ENDIAN_LITTLE; output_bom_f = TRUE; break; + case UTF_32: case UTF_32BE_BOM: output_bom_f = TRUE; break; @@ -1668,7 +1671,7 @@ nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_ *p3 = 0x80 | ( val & 0x3f); *p4 = 0; } else if (nkf_char_unicode_value_p(val)) { - *p1 = 0xe0 | (val >> 16); + *p1 = 0xf0 | (val >> 18); *p2 = 0x80 | ((val >> 12) & 0x3f); *p3 = 0x80 | ((val >> 6) & 0x3f); *p4 = 0x80 | ( val & 0x3f); @@ -2232,13 +2235,15 @@ nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0) { - return 0; + (*oconv)(c2, c1); + return 16; /* different from w_iconv32 */ } static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0) { - return 0; + (*oconv)(c2, c1); + return 32; /* different from w_iconv16 */ } static size_t @@ -3000,6 +3005,7 @@ typedef struct { nkf_char broken_state; nkf_buf_t *broken_buf; nkf_char mimeout_state; + nkf_buf_t *nfc_buf; } nkf_state_t; static nkf_state_t *nkf_state = NULL; @@ -3012,11 +3018,13 @@ nkf_state_init(void) if (nkf_state) { nkf_buf_clear(nkf_state->std_gc_buf); nkf_buf_clear(nkf_state->broken_buf); + nkf_buf_clear(nkf_state->nfc_buf); } else { nkf_state = nkf_xmalloc(sizeof(nkf_state_t)); nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE); nkf_state->broken_buf = nkf_buf_new(3); + nkf_state->nfc_buf = nkf_buf_new(9); } nkf_state->broken_state = 0; nkf_state->mimeout_state = 0; @@ -3049,23 +3057,23 @@ std_putc(nkf_char c) } #endif /*WIN32DLL*/ -static unsigned char hold_buf[HOLD_SIZE*2]; +static nkf_char hold_buf[HOLD_SIZE*2]; static int hold_count = 0; static nkf_char push_hold_buf(nkf_char c2) { if (hold_count >= HOLD_SIZE*2) return (EOF); - hold_buf[hold_count++] = (unsigned char)c2; + hold_buf[hold_count++] = c2; return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); } static int -h_conv(FILE *f, int c1, int c2) +h_conv(FILE *f, nkf_char c1, nkf_char c2) { - int ret, c4, c3; + int ret; int hold_index; - + nkf_char c3, c4; /** it must NOT be in the kanji shifte sequence */ /** it must NOT be written in JIS7 */ @@ -3115,7 +3123,11 @@ h_conv(FILE *f, int c1, int c2) hold_index = 0; while (hold_index < hold_count){ c1 = hold_buf[hold_index++]; - if (c1 <= DEL){ + if (nkf_char_unicode_p(c1)) { + (*oconv)(0, c1); + continue; + } + else if (c1 <= DEL){ (*iconv)(0, c1, 0); continue; }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){ @@ -3141,18 +3153,16 @@ h_conv(FILE *f, int c1, int c2) } else if ((c3 = (*i_getc)(f)) == EOF) { ret = EOF; break; - } else { - code_status(c3); - if (hold_index < hold_count){ - c4 = hold_buf[hold_index++]; - } else if ((c4 = (*i_getc)(f)) == EOF) { - c3 = ret = EOF; - break; - } else { - code_status(c4); - (*iconv)(c1, c2, (c3<<8)|c4); - } } + code_status(c3); + if (hold_index < hold_count){ + c4 = hold_buf[hold_index++]; + } else if ((c4 = (*i_getc)(f)) == EOF) { + c3 = ret = EOF; + break; + } + code_status(c4); + (*iconv)(c1, c2, (c3<<8)|c4); break; case -1: /* 3 bytes EUC or UTF-8 */ @@ -3352,6 +3362,40 @@ eol_conv(nkf_char c2, nkf_char c1) else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1); } +static void +put_newline(void (*func)(nkf_char)) +{ + switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { + case CRLF: + (*func)(0x0D); + (*func)(0x0A); + break; + case CR: + (*func)(0x0D); + break; + case LF: + (*func)(0x0A); + break; + } +} + +static void +oconv_newline(void (*func)(nkf_char, nkf_char)) +{ + switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { + case CRLF: + (*func)(0, 0x0D); + (*func)(0, 0x0A); + break; + case CR: + (*func)(0, 0x0D); + break; + case LF: + (*func)(0, 0x0A); + break; + } +} + /* Return value of fold_conv() @@ -3428,9 +3472,7 @@ fold_conv(nkf_char c2, nkf_char c1) f_prev = LF; f_line = 0; fold_state = LF; /* output newline and clear */ - } else if ( (c2==0 && c1==SP)|| - (c2==0 && c1==TAB)|| - (c2=='!'&& c1=='!')) { + } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) { /* X0208 kankaku or ascii space */ if (f_prev == SP) { fold_state = 0; /* remove duplicate spaces */ @@ -3526,13 +3568,13 @@ fold_conv(nkf_char c2, nkf_char c1) /* terminator process */ switch(fold_state) { case LF: - OCONV_NEWLINE((*o_fconv)); + oconv_newline(o_fconv); (*o_fconv)(c2,c1); break; case 0: return; case CR: - OCONV_NEWLINE((*o_fconv)); + oconv_newline(o_fconv); break; case TAB: case SP: @@ -3819,6 +3861,7 @@ static const unsigned char *mime_pattern[] = { (const unsigned char *)"\075?ISO-8859-1?Q?", (const unsigned char *)"\075?ISO-8859-1?B?", (const unsigned char *)"\075?ISO-2022-JP?B?", + (const unsigned char *)"\075?ISO-2022-JP?B?", (const unsigned char *)"\075?ISO-2022-JP?Q?", #if defined(UTF8_INPUT_ENABLE) (const unsigned char *)"\075?UTF-8?B?", @@ -3839,7 +3882,7 @@ nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = { }; static const nkf_char mime_encode[] = { - EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, + EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K, #if defined(UTF8_INPUT_ENABLE) UTF_8, UTF_8, #endif @@ -3848,7 +3891,7 @@ static const nkf_char mime_encode[] = { }; static const nkf_char mime_encode_method[] = { - 'B', 'B','Q', 'B', 'B', 'Q', + 'B', 'B','Q', 'B', 'B', 'B', 'Q', #if defined(UTF8_INPUT_ENABLE) 'B', 'Q', #endif @@ -4273,14 +4316,14 @@ nfc_getc(FILE *f) { nkf_char (*g)(FILE *f) = i_nfc_getc; nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc; - nkf_buf_t *buf = nkf_buf_new(9); + nkf_buf_t *buf = nkf_state->nfc_buf; const unsigned char *array; int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1; nkf_char c = (*g)(f); if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c; - nkf_buf_push(buf, (unsigned char)c); + nkf_buf_push(buf, c); do { while (lower <= upper) { int mid = (lower+upper) / 2; @@ -4316,7 +4359,6 @@ nfc_getc(FILE *f) while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f); c = nkf_buf_pop(buf); - nkf_buf_dispose(buf); return c; } @@ -4398,7 +4440,7 @@ mime_getc(FILE *f) case LF: case CR: if (c1==LF) { - if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) { + if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { i_ungetc(SP,f); continue; } else { @@ -4407,7 +4449,7 @@ mime_getc(FILE *f) c1 = LF; } else { if ((c1=(*i_getc)(f))!=EOF && c1 == LF) { - if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) { + if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { i_ungetc(SP,f); continue; } else { @@ -4443,7 +4485,7 @@ mime_getc(FILE *f) } if (c1=='='&&c20 - && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB - || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) { + if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) { i++; } } for (;i 73){ (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); + oconv_newline(o_base64conv); (*o_base64conv)(0,SP); base64_count = 1; } } else { - if (base64_count + mimeout_state.count/3*4> 66) { + if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) { (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); + oconv_newline(o_base64conv); (*o_base64conv)(0,SP); base64_count = 1; mimeout_mode = -1; @@ -4669,7 +4708,7 @@ mime_prechar(nkf_char c2, nkf_char c1) mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B'; open_mime(output_mode); (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); + oconv_newline(o_base64conv); (*o_base64conv)(0,SP); base64_count = 1; mimeout_mode = -1; @@ -4767,14 +4806,14 @@ mime_putc(nkf_char c) if (base64_count > 71){ if (c!=CR && c!=LF) { (*o_mputc)('='); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); } base64_count = 0; } }else{ if (base64_count > 71){ eof_mime(); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; } if (c == EOF) { /* c==EOF */ @@ -4836,7 +4875,7 @@ mime_putc(nkf_char c) } else if (c <= SP) { close_mime(); if (base64_count > 70) { - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; } if (!nkf_isblank(c)) { @@ -4846,7 +4885,7 @@ mime_putc(nkf_char c) } else { if (base64_count > 70) { close_mime(); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); (*o_mputc)(SP); base64_count = 1; open_mime(output_mode); @@ -4856,14 +4895,17 @@ mime_putc(nkf_char c) return; } } - (*o_mputc)(c); - base64_count++; + if (c != 0x1B) { + (*o_mputc)(c); + base64_count++; + return; + } } - return; } if (mimeout_mode <= 0) { - if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { + if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || + output_mode == UTF_8)) { if (nkf_isspace(c)) { int flag = 0; if (mimeout_mode == -1) { @@ -4908,7 +4950,7 @@ mime_putc(nkf_char c) } if (i == 0 || i == mimeout_state.count - len) { - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; if (!nkf_isspace(mimeout_state.buf[0])){ (*o_mputc)(SP); @@ -4920,7 +4962,7 @@ mime_putc(nkf_char c) for (j = 0; j <= i; ++j) { (*o_mputc)(mimeout_state.buf[j]); } - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 1; for (; j <= mimeout_state.count; ++j) { mimeout_state.buf[j - i] = mimeout_state.buf[j]; @@ -4954,14 +4996,15 @@ mime_putc(nkf_char c) } }else{ /* mimeout_mode == 'B', 1, 2 */ - if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { + if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || + output_mode == UTF_8)) { if (lastchar == CR || lastchar == LF){ if (nkf_isblank(c)) { for (i=0;i DEL) { /* 8 bit code */ if (!estab_f && !iso8859_f) { @@ -5537,7 +5592,7 @@ kanji_convert(FILE *f) SKIP; } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) { if ((c1 = (*i_getc)(f)) == EOF) { - /* (*oconv)(0, ESC); don't send bogus code */ + (*oconv)(0, ESC); LAST; } else if (c1 == '&') { @@ -5667,7 +5722,7 @@ kanji_convert(FILE *f) } else if (c1 == ESC && iconv == s_iconv) { /* ESC in Shift_JIS */ if ((c1 = (*i_getc)(f)) == EOF) { - /* (*oconv)(0, ESC); don't send bogus code */ + (*oconv)(0, ESC); LAST; } else if (c1 == '$') { /* J-PHONE emoji */ @@ -5792,6 +5847,7 @@ kanji_convert(FILE *f) /* goto next_word */ } +finished: /* epilogue */ (*iconv)(EOF, 0, 0); if (!input_codename) @@ -5863,6 +5919,12 @@ options(unsigned char *cp) cp_back = cp; cp = (unsigned char *)long_option[i].alias; }else{ +#ifndef PERL_XS + if (strcmp(long_option[i].name, "help") == 0){ + usage(); + exit(EXIT_SUCCESS); + } +#endif if (strcmp(long_option[i].name, "ic=") == 0){ enc = nkf_enc_find((char *)p); if (!enc) continue; @@ -6101,7 +6163,7 @@ options(unsigned char *cp) output_encoding = nkf_enc_from_index(EUCJP_NKF); continue; case 's': /* SJIS output */ - output_encoding = nkf_enc_from_index(WINDOWS_31J); + output_encoding = nkf_enc_from_index(SHIFT_JIS); continue; case 'l': /* ISO8859 Latin-1 support, no conversion */ iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */ @@ -6111,7 +6173,8 @@ options(unsigned char *cp) if (*cp=='@'||*cp=='B') kanji_intro = *cp++; continue; - case 'o': /* ASCII IN ESC-(-J/B */ + case 'o': /* ASCII IN ESC-(-J/B/H */ + /* ESC ( H was used in initial JUNET messages */ if (*cp=='J'||*cp=='B'||*cp=='H') ascii_intro = *cp++; continue; @@ -6136,15 +6199,15 @@ options(unsigned char *cp) #ifndef PERL_XS case 'V': show_configuration(); - exit(1); + exit(EXIT_SUCCESS); break; case 'v': - usage(); - exit(1); + version(); + exit(EXIT_SUCCESS); break; #endif #ifdef UTF8_OUTPUT_ENABLE - case 'w': /* UTF-8 output */ + case 'w': /* UTF-{8,16,32} output */ if (cp[0] == '8') { cp++; if (cp[0] == '0'){ @@ -6169,19 +6232,18 @@ options(unsigned char *cp) if (cp[0]=='L') { cp++; output_endian = ENDIAN_LITTLE; + output_bom_f = TRUE; } else if (cp[0] == 'B') { cp++; - } else { - output_encoding = nkf_enc_from_index(enc_idx); - continue; + output_bom_f = TRUE; } if (cp[0] == '0'){ + output_bom_f = FALSE; cp++; enc_idx = enc_idx == UTF_16 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE) : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE); } else { - output_bom_f = TRUE; enc_idx = enc_idx == UTF_16 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM) : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM); @@ -6230,8 +6292,8 @@ options(unsigned char *cp) case 'E': /* EUC-JP input */ input_encoding = nkf_enc_from_index(EUCJP_NKF); continue; - case 'S': /* Windows-31J input */ - input_encoding = nkf_enc_from_index(WINDOWS_31J); + case 'S': /* Shift_JIS input */ + input_encoding = nkf_enc_from_index(SHIFT_JIS); continue; case 'Z': /* Convert X0208 alphabet to asii */ /* alpha_f @@ -6241,10 +6303,10 @@ options(unsigned char *cp) bit:3 Convert HTML Entity bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana */ - while ('0'<= *cp && *cp <='9') { + while ('0'<= *cp && *cp <='4') { alpha_f |= 1 << (*cp++ - '0'); } - if (!alpha_f) alpha_f = 1; + alpha_f |= 1; continue; case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */ x0201_f = FALSE; /* No X0201->X0208 conversion */