** E-Mail: furukawa@tcp-ip.or.jp
** \e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#\e(B
***********************************************************************/
-/* $Id: nkf.c,v 1.108 2006/09/15 08:06:14 naruse Exp $ */
+/* $Id: nkf.c,v 1.117 2006/11/04 14:35:25 naruse Exp $ */
#define NKF_VERSION "2.0.8"
-#define NKF_RELEASE_DATE "2006-09-15"
+#define NKF_RELEASE_DATE "2006-11-04"
#include "config.h"
#include "utf8tbl.h"
{"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
#ifdef UTF8_INPUT_ENABLE
{"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
- {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},\r
- {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},\r
+ {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
+ {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
#endif
{0}
};
}
} else {
int nfiles = argc;
+ int is_argument_error = FALSE;
while (argc--) {
is_inputcode_mixed = FALSE;
is_inputcode_set = FALSE;
#endif
if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
perror(*--argv);
- return(-1);
+ *argv++;
+ is_argument_error = TRUE;
+ continue;
} else {
#ifdef OVERWRITE
int fd = 0;
#endif
}
}
+ if (is_argument_error)
+ return(-1);
}
#ifdef EASYWIN /*Easy Win */
if (file_out_f == FALSE)
codeset[i] = nkf_toupper(p[i]);
}
codeset[i] = 0;
- if(strcmp(codeset, "ISO-2022-JP") == 0 ||
- strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
+ if(strcmp(codeset, "ISO-2022-JP") == 0){
+ input_f = JIS_INPUT;
+ }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
strcmp(codeset, "CP50220") == 0 ||
strcmp(codeset, "CP50221") == 0 ||
- strcmp(codeset, "CP50222") == 0 ||
- strcmp(codeset, "ISO-2022-JP-MS") == 0){
+ strcmp(codeset, "CP50222") == 0){
input_f = JIS_INPUT;
+#ifdef SHIFTJIS_CP932
+ cp51932_f = TRUE;
+#endif
+#ifdef UTF8_OUTPUT_ENABLE
+ ms_ucs_map_f = UCS_MAP_CP932;
+#endif
}else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
input_f = JIS_INPUT;
#ifdef X0212_ENABLE
x0213_f = TRUE;
}else if(strcmp(codeset, "SHIFT_JIS") == 0){
input_f = SJIS_INPUT;
- if (x0201_f==NO_X0201) x0201_f=TRUE;
}else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
strcmp(codeset, "CSWINDOWS31J") == 0 ||
strcmp(codeset, "CP932") == 0 ||
strcmp(codeset, "MS932") == 0){
input_f = SJIS_INPUT;
- x0201_f = FALSE;
#ifdef SHIFTJIS_CP932
cp51932_f = TRUE;
#endif
input_f = EUC_INPUT;
}else if(strcmp(codeset, "CP51932") == 0){
input_f = EUC_INPUT;
- x0201_f = FALSE;
#ifdef SHIFTJIS_CP932
cp51932_f = TRUE;
#endif
strcmp(codeset, "EUCJP-MS") == 0 ||
strcmp(codeset, "EUCJPMS") == 0){
input_f = EUC_INPUT;
- x0201_f = FALSE;
#ifdef SHIFTJIS_CP932
cp51932_f = FALSE;
#endif
}else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
strcmp(codeset, "EUCJP-ASCII") == 0){
input_f = EUC_INPUT;
- x0201_f = FALSE;
#ifdef SHIFTJIS_CP932
cp51932_f = FALSE;
#endif
cp51932_f = FALSE;
cp932inv_f = FALSE;
#endif
- if (x0201_f==NO_X0201) x0201_f=TRUE;
}else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
strcmp(codeset, "EUC-JIS-2004") == 0){
input_f = EUC_INPUT;
- x0201_f = FALSE;
x0213_f = TRUE;
#ifdef SHIFTJIS_CP932
cp51932_f = FALSE;
codeset[i] = nkf_toupper(p[i]);
}
codeset[i] = 0;
- if(strcmp(codeset, "ISO-2022-JP") == 0 ||
- strcmp(codeset, "CP50220") == 0){
+ if(strcmp(codeset, "ISO-2022-JP") == 0){
output_conv = j_oconv;
}else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
output_conv = j_oconv;
+ x0201_f = FALSE;
no_cp932ext_f = TRUE;
- }else if(strcmp(codeset, "CP50221") == 0 ||
- strcmp(codeset, "ISO-2022-JP-MS") == 0){
+#ifdef SHIFTJIS_CP932
+ cp51932_f = TRUE;
+#endif
+#ifdef UTF8_OUTPUT_ENABLE
+ ms_ucs_map_f = UCS_MAP_CP932;
+#endif
+ }else if(strcmp(codeset, "CP50220") == 0){
+ output_conv = j_oconv;
+#ifdef SHIFTJIS_CP932
+ cp51932_f = TRUE;
+#endif
+#ifdef UTF8_OUTPUT_ENABLE
+ ms_ucs_map_f = UCS_MAP_CP932;
+#endif
+ }else if(strcmp(codeset, "CP50221") == 0){
output_conv = j_oconv;
x0201_f = FALSE;
+#ifdef SHIFTJIS_CP932
+ cp51932_f = TRUE;
+#endif
+#ifdef UTF8_OUTPUT_ENABLE
+ ms_ucs_map_f = UCS_MAP_CP932;
+#endif
}else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
output_conv = j_oconv;
#ifdef X0212_ENABLE
struct input_code *result = 0;
struct input_code *p = input_code_list;
while (p->name){
- if (!p->status_func) {\r
- ++p;\r
- continue;\r
- }\r
+ if (!p->status_func) {
+ ++p;
+ continue;
+ }
if (!p->status_func)
continue;
(p->status_func)(p, c);
if(!input_f){
set_iconv(TRUE, w_iconv32);
}
- input_endian = ENDIAN_BIG;
- return;
+ if (iconv == w_iconv32) {
+ input_endian = ENDIAN_BIG;
+ return;
+ }
+ (*i_ungetc)(0xFF,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xFE,f);
}else if(c2 == 0xFF){
if(!input_f){
set_iconv(TRUE, w_iconv32);
}
- input_endian = ENDIAN_2143;
- return;
+ if (iconv == w_iconv32) {
+ input_endian = ENDIAN_2143;
+ return;
+ }
+ (*i_ungetc)(0xFF,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xFF,f);
}else (*i_ungetc)(c2,f);
if(!input_f){
set_iconv(TRUE, w_iconv);
}
- return;
+ if (iconv == w_iconv) {
+ return;
+ }
+ (*i_ungetc)(0xBF,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xBB,f);
}else (*i_ungetc)(c2,f);
if(!input_f){
set_iconv(TRUE, w_iconv32);
}
- input_endian = ENDIAN_3412;
- return;
+ if (iconv == w_iconv32) {
+ input_endian = ENDIAN_3412;
+ return;
+ }
+ (*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
if(!input_f){
set_iconv(TRUE, w_iconv16);
}
- input_endian = ENDIAN_BIG;
- return;
+ if (iconv == w_iconv16) {
+ input_endian = ENDIAN_BIG;
+ return;
+ }
+ (*i_ungetc)(0xFF,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xFE,f);
break;
if(!input_f){
set_iconv(TRUE, w_iconv32);
}
- input_endian = ENDIAN_LITTLE;
- return;
+ if (iconv == w_iconv32) {
+ input_endian = ENDIAN_LITTLE;
+ return;
+ }
+ (*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
if(!input_f){
set_iconv(TRUE, w_iconv16);
}
- input_endian = ENDIAN_LITTLE;
- return;
+ if (iconv == w_iconv16) {
+ input_endian = ENDIAN_LITTLE;
+ return;
+ }
+ (*i_ungetc)(0xFE,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xFF,f);
break;
else
c2 = 0;
NEXT;
- } else
- /* in case of already established */
- if (c1 < AT) {
- /* ignore bogus code */
- c2 = 0;
- NEXT;
- } else
- SEND;
+ } else {
+ /* in case of already established */
+ if (c1 < AT && !(X0208 && 0x80 <= c2 && c2 <= 0x92)) {
+ /* ignore bogus code and not CP5022x UCD */
+ c2 = 0;
+ NEXT;
+ } else {
+ SEND;
+ }
+ }
} else
/* second byte, 7 bit code */
/* it might be kanji shitfted */
c0 <<= 8;
if ((c3 = (*i_getc)(f)) != EOF) {
c0 |= c3;
- } else c1 = EOF;
- } else c1 = EOF;
+ } else c2 = EOF;
+ } else c2 = EOF;
}
- }
+ } else c2 = EOF;
} else {
if ((c2 = (*i_getc)(f)) != EOF) {
if (0xD8 <= c2 && c2 <= 0xDB) {
if ((c0 = (*i_getc)(f)) != EOF) {
c0 <<= 8;
c0 |= c3;
- } else c1 = EOF;
- } else c1 = EOF;
+ } else c2 = EOF;
+ } else c2 = EOF;
}
- } else c1 = EOF;
+ } else c2 = EOF;
}
SEND;
} else if(iconv == w_iconv32){
}
c2 = 0;
}else{
- c1 = EOF;
+ c2 = EOF;
}
SEND;
} else
(*oconv)(0, ESC);
SEND;
}
+ } else if (c1 == ESC && iconv == s_iconv) {
+ /* ESC in Shift_JIS */
+ if ((c1 = (*i_getc)(f)) == EOF) {
+ /* (*oconv)(0, ESC); don't send bogus code */
+ LAST;
+ } else if (c1 == '$') {
+ /* J-PHONE emoji */
+ if ((c1 = (*i_getc)(f)) == EOF) {
+ /*
+ (*oconv)(0, ESC); don't send bogus code
+ (*oconv)(0, '$'); */
+ LAST;
+ } else {
+ if (('E' <= c1 && c1 <= 'G') ||
+ ('O' <= c1 && c1 <= 'Q')) {
+ /*
+ NUM : 0 1 2 3 4 5
+ BYTE: G E F O P Q
+ C%7 : 1 6 0 2 3 4
+ C%7 : 0 1 2 3 4 5 6
+ NUM : 2 0 3 4 5 X 1
+ */
+ static const int jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
+ c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SPACE + 0xE000 + CLASS_UNICODE;
+ while ((c1 = (*i_getc)(f)) != EOF) {
+ if (SPACE <= c1 && c1 <= 'z') {
+ (*oconv)(0, c1 + c0);
+ } else break; /* c1 == SO */
+ }
+ }
+ }
+ if (c1 == EOF) LAST;
+ NEXT;
+ } else {
+ /* lonely ESC */
+ (*oconv)(0, ESC);
+ SEND;
+ }
} else if ((c1 == NL || c1 == CR) && broken_f&4) {
input_mode = ASCII; set_iconv(FALSE, 0);
SEND;
}
c1 = CR;
SEND;
+ } else if (c1 == DEL && input_mode == X0208 ) {
+ /* CP5022x */
+ c2 = c1;
+ NEXT;
} else
SEND;
}
break;
case X0208:
case X0213_1:
+ if (ms_ucs_map_f == UCS_MAP_CP932 &&
+ 0x7F <= c2 && c2 <= 0x92 &&
+ 0x21 <= c1 && c1 <= 0x7E) {
+ /* CP932 UDC */
+ if(c1 == 0x7F) return 0;
+ c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
+ c2 = 0;
+ }
(*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
break;
#ifdef X0212_ENABLE
c1 &= 0x7f;
} else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
/* NOP */
+ } else if (ms_ucs_map_f == UCS_MAP_CP932 &&
+ 0xF0 <= c2 && c2 <= 0xF9 &&
+ 0x40 <= c1 && c1 <= 0xFC) {
+ /* CP932 UDC */
+ if(c1 == 0x7F) return 0;
+ c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
+ c2 = 0;
} else {
nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
if (ret) return ret;
if (c0 == 0){
return -1;
}
- c2 = (c2 << 8) | (c1 & 0x7f);
- c1 = c0 & 0x7f;
+ if (ms_ucs_map_f == UCS_MAP_MS && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
+ /* encoding is eucJP-ms, so invert to Unicode Private User Area */
+ c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
+ c2 = 0;
+ } else {
+ c2 = (c2 << 8) | (c1 & 0x7f);
+ c1 = c0 & 0x7f;
#ifdef SHIFTJIS_CP932
- if (cp51932_f){
- nkf_char s2, s1;
- if (e2s_conv(c2, c1, &s2, &s1) == 0){
- s2e_conv(s2, s1, &c2, &c1);
- if (c2 < 0x100){
- c1 &= 0x7f;
- c2 &= 0x7f;
- }
- }
- }
+ if (cp51932_f){
+ nkf_char s2, s1;
+ if (e2s_conv(c2, c1, &s2, &s1) == 0){
+ s2e_conv(s2, s1, &c2, &c1);
+ if (c2 < 0x100){
+ c1 &= 0x7f;
+ c2 &= 0x7f;
+ }
+ }
+ }
#endif /* SHIFTJIS_CP932 */
+ }
#endif /* X0212_ENABLE */
} else if (c2 == SSO){
c2 = X0201;
} else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
/* NOP */
} else {
- c1 &= 0x7f;
- c2 &= 0x7f;
+ if (ms_ucs_map_f == UCS_MAP_MS && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
+ /* encoding is eucJP-ms, so invert to Unicode Private User Area */
+ c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
+ c2 = 0;
+ } else {
+ c1 &= 0x7f;
+ c2 &= 0x7f;
+ }
}
(*oconv)(c2, c1);
return 0;
nkf_char val = e2w_conv(c2, c1);
c2 = (val >> 8) & 0xff;
c1 = val & 0xff;
+ if (!val) return;
}
if (output_endian == ENDIAN_LITTLE){
(*o_putc)(c1);
#endif
} else if (c2) {
c1 = e2w_conv(c2, c1);
+ if (!c1) return;
}
if (output_endian == ENDIAN_LITTLE){
(*o_putc)( c1 & NKF_INT32_C(0x000000FF));
if (c2 == 0 && is_unicode_capsule(c1)){
w16e_conv(c1, &c2, &c1);
if (c2 == 0 && is_unicode_capsule(c1)){
- if(encode_fallback)(*encode_fallback)(c1);
- return;
+ c2 = c1 & VALUE_MASK;
+ if (ms_ucs_map_f == UCS_MAP_MS &&
+ 0xE000 <= c2 && c2 <= 0xE757) {
+ /* eucJP-ms UDC */
+ c1 &= 0xFFF;
+ c2 = c1 / 94;
+ c2 += c2 < 10 ? 0x75 : 0x8FEB;
+ c1 = 0x21 + c1 % 94;
+ } else {
+ if (encode_fallback) (*encode_fallback)(c1);
+ return;
+ }
}
}
#endif
if (c2 == 0 && is_unicode_capsule(c1)){
w16e_conv(c1, &c2, &c1);
if (c2 == 0 && is_unicode_capsule(c1)){
- if(encode_fallback)(*encode_fallback)(c1);
- return;
- }
+ c2 = c1 & VALUE_MASK;
+ if (ms_ucs_map_f == UCS_MAP_CP932 &&
+ 0xE000 <= c2 && c2 <= 0xE757) {
+ /* CP932 UDC */
+ c1 &= 0xFFF;
+ c2 = c1 / 188 + 0xF0;
+ c1 = c1 % 188;
+ c1 += 0x40 + (c1 > 0x3e);
+ (*o_putc)(c2);
+ (*o_putc)(c1);
+ return;
+ } else {
+ if(encode_fallback)(*encode_fallback)(c1);
+ return;
+ }
+ }
}
#endif
if (c2 == EOF) {
if (c2 == 0 && is_unicode_capsule(c1)){
w16e_conv(c1, &c2, &c1);
if (c2 == 0 && is_unicode_capsule(c1)){
- if(encode_fallback)(*encode_fallback)(c1);
- return;
+ c2 = c1 & VALUE_MASK;
+ if (ms_ucs_map_f == UCS_MAP_CP932 &&
+ 0xE000 <= c2 && c2 <= 0xE757) {
+ /* CP5022x UDC */
+ c1 &= 0xFFF;
+ c2 = 0x7F + c1 / 94;
+ c1 = 0x21 + c1 % 94;
+ } else {
+ if (encode_fallback) (*encode_fallback)(c1);
+ return;
+ }
}
}
#endif
}
(*o_putc)(c1);
} else {
- if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
+ if(ms_ucs_map_f == UCS_MAP_CP932
+ ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
+ : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
if(x0213_f){
if (output_mode!=X0213_1) {
output_mode = X0213_1;