X-Git-Url: http://git.sourceforge.jp/view?p=nkf%2Fnkf.git;a=blobdiff_plain;f=nkf.c;h=5f52748e2b05a59e0eb90c81206250eec799f405;hp=d1d23ebcff58ce9f16aeaf1d330d620231d9654d;hb=caa533415c6aea5db29b4fa89e67718c7bdf2f54;hpb=5912ff367d97d60339a5e64e91496c2ec4efacc7 diff --git a/nkf.c b/nkf.c index d1d23eb..5f52748 100644 --- a/nkf.c +++ b/nkf.c @@ -21,7 +21,7 @@ * 3. This notice may not be removed or altered from any source distribution. */ #define NKF_VERSION "2.0.9" -#define NKF_RELEASE_DATE "2009-01-20" +#define NKF_RELEASE_DATE "2009-04-26" #define COPY_RIGHT \ "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \ "Copyright (C) 1996-2009, The nkf Project." @@ -295,7 +295,7 @@ struct { && (c != '(') && (c != ')') && (c != '.') && (c != 0x22))) #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END) -#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c < (0xE0&0x7F)) +#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c <= 0x5F) #define HOLD_SIZE 1024 #if defined(INT_IS_SHORT) @@ -468,6 +468,8 @@ struct input_code input_code_list[] = { {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0}, #ifdef UTF8_INPUT_ENABLE {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0}, + {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0}, + {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0}, #endif {0} }; @@ -802,7 +804,7 @@ nkf_default_encoding() typedef struct { long capa; long len; - unsigned char *ptr; + nkf_char *ptr; } nkf_buf_t; static nkf_buf_t * @@ -841,7 +843,7 @@ nkf_buf_clear(nkf_buf_t *buf) } static void -nkf_buf_push(nkf_buf_t *buf, unsigned char c) +nkf_buf_push(nkf_buf_t *buf, nkf_char c) { if (buf->capa <= buf->len) { exit(EXIT_FAILURE); @@ -873,17 +875,20 @@ usage(void) { fprintf(HELP_OUTPUT, "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n" - " j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n" #ifdef UTF8_OUTPUT_ENABLE - " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n" + " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" + " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n" +#else #endif - " J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n" #ifdef UTF8_INPUT_ENABLE - " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n" + " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" + " UTF option is -W[8,[16,32][B,L]]\n" +#else + " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" #endif ); fprintf(HELP_OUTPUT, - " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n" + " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n" " M[BQ] MIME encode [B:base64 Q:quoted]\n" " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n" ); @@ -898,32 +903,31 @@ usage(void) " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n" ); fprintf(HELP_OUTPUT, - "Long name options\n" - " --ic= --oc=\n" - " Specify the input or output codeset\n" - " --hiragana --katakana --katakana-hiragana\n" - " To Hiragana/Katakana Conversion\n" + " --ic= Specify the input encoding\n" + " --oc= Specify the output encoding\n" + " --hiragana --katakana Hiragana/Katakana Conversion\n" + " --katakana-hiragana Converts each other\n" ); fprintf(HELP_OUTPUT, #ifdef INPUT_OPTION - " --cap-input, --url-input Convert hex after ':' or '%%'\n" + " --{cap, url}-input Convert hex after ':' or '%%'\n" #endif #ifdef NUMCHAR_OPTION - " --numchar-input Convert Unicode Character Reference\n" + " --numchar-input Convert Unicode Character Reference\n" #endif #ifdef UTF8_INPUT_ENABLE " --fb-{skip, html, xml, perl, java, subchar}\n" - " Specify how nkf handles unassigned characters\n" + " Specify unassigned character's replacement\n" #endif ); fprintf(HELP_OUTPUT, #ifdef OVERWRITE - " --in-place[=SUF] Overwrite original listed files by filtered result\n" - " --overwrite[=SUF] in-place and preserve timestamp of original files\n" + " --in-place[=SUF] Overwrite original files\n" + " --overwrite[=SUF] Preserve timestamp of original files\n" #endif - " -g --guess Guess the input code\n" - " -v --version print the version\n" - " --help/-V print this help / configuration\n" + " -g --guess Guess the input code\n" + " -v --version Print the version\n" + " --help/-V Print this help / configuration\n" ); version(); } @@ -1424,6 +1428,7 @@ set_output_encoding(nkf_encoding *enc) output_endian = ENDIAN_LITTLE; output_bom_f = TRUE; break; + case UTF_32: case UTF_32BE_BOM: output_bom_f = TRUE; break; @@ -1650,7 +1655,7 @@ nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_ *p3 = 0x80 | ( val & 0x3f); *p4 = 0; } else if (nkf_char_unicode_value_p(val)) { - *p1 = 0xe0 | (val >> 16); + *p1 = 0xf0 | (val >> 18); *p2 = 0x80 | ((val >> 12) & 0x3f); *p3 = 0x80 | ((val >> 6) & 0x3f); *p4 = 0x80 | ( val & 0x3f); @@ -3126,18 +3131,16 @@ h_conv(FILE *f, int c1, int c2) } else if ((c3 = (*i_getc)(f)) == EOF) { ret = EOF; break; - } else { - code_status(c3); - if (hold_index < hold_count){ - c4 = hold_buf[hold_index++]; - } else if ((c4 = (*i_getc)(f)) == EOF) { - c3 = ret = EOF; - break; - } else { - code_status(c4); - (*iconv)(c1, c2, (c3<<8)|c4); - } } + code_status(c3); + if (hold_index < hold_count){ + c4 = hold_buf[hold_index++]; + } else if ((c4 = (*i_getc)(f)) == EOF) { + c3 = ret = EOF; + break; + } + code_status(c4); + (*iconv)(c1, c2, (c3<<8)|c4); break; case -1: /* 3 bytes EUC or UTF-8 */ @@ -4263,7 +4266,7 @@ nfc_getc(FILE *f) if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c; - nkf_buf_push(buf, (unsigned char)c); + nkf_buf_push(buf, c); do { while (lower <= upper) { int mid = (lower+upper) / 2; @@ -5442,6 +5445,12 @@ kanji_convert(FILE *f) if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) { /* CP5022x */ MORE; + }else if (input_codename && input_codename[0] == 'I' && + 0xA1 <= c1 && c1 <= 0xDF) { + /* JIS X 0201 Katakana in 8bit JIS */ + c2 = JIS_X_0201_1976_K; + c1 &= 0x7f; + SEND; } else if (c1 > DEL) { /* 8 bit code */ if (!estab_f && !iso8859_f) { @@ -5842,10 +5851,12 @@ options(unsigned char *cp) cp_back = cp; cp = (unsigned char *)long_option[i].alias; }else{ +#ifndef PERL_XS if (strcmp(long_option[i].name, "help") == 0){ usage(); exit(EXIT_SUCCESS); } +#endif if (strcmp(long_option[i].name, "ic=") == 0){ enc = nkf_enc_find((char *)p); if (!enc) continue; @@ -6155,9 +6166,6 @@ options(unsigned char *cp) output_endian = ENDIAN_LITTLE; } else if (cp[0] == 'B') { cp++; - } else { - output_encoding = nkf_enc_from_index(enc_idx); - continue; } if (cp[0] == '0'){ cp++; @@ -6228,6 +6236,7 @@ options(unsigned char *cp) while ('0'<= *cp && *cp <='9') { alpha_f |= 1 << (*cp++ - '0'); } + if (alpha_f & ((1 << 2) | (1 << 3))) alpha_f |= 1; if (!alpha_f) alpha_f = 1; continue; case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */