OSDN Git Service

* Fix: other fixes around cp932 encodings.
author NARUSE, Yui <naruse@users.sourceforge.jp>
Sun, 12 Nov 2006 05:33:56 +0000 (05:33 +0000)
committer NARUSE, Yui <naruse@users.sourceforge.jp>
Sun, 12 Nov 2006 05:33:56 +0000 (05:33 +0000)
  by MORIYAMA Masayuki
  Message-ID: <454ED961.9070808@miraclelinux.com>

nkf.c
utf8tbl.c

diff --git a/nkf.c b/nkf.c
index 00d36d6..b9f4755 100644 (file)
--- a/nkf.c
+++ b/nkf.c
@@ -39,9 +39,9 @@
 **        E-Mail: furukawa@tcp-ip.or.jp
 **    \e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#\e(B
 ***********************************************************************/
-/* $Id: nkf.c,v 1.118 2006/11/10 09:45:39 naruse Exp $ */
+/* $Id: nkf.c,v 1.119 2006/11/11 20:33:56 naruse Exp $ */
 #define NKF_VERSION "2.0.8"
-#define NKF_RELEASE_DATE "2006-11-04"
+#define NKF_RELEASE_DATE "2006-11-12"
 #include "config.h"
 #include "utf8tbl.h"
 
@@ -548,7 +548,7 @@ static int exec_f = 0;
 
 #ifdef SHIFTJIS_CP932
 /* invert IBM extended characters to others */
-static int cp51932_f = TRUE;
+static int cp51932_f = FALSE;
 
 /* invert NEC-selected IBM extended characters to IBM extended characters */
 static int cp932inv_f = TRUE;
@@ -1269,7 +1269,6 @@ void options(unsigned char *cp)
                        x0213_f = TRUE;
 #ifdef SHIFTJIS_CP932
                        cp51932_f = FALSE;
-                       cp932inv_f = FALSE;
 #endif
                    }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
                             strcmp(codeset, "EUC-JIS-2004") == 0){
@@ -1277,7 +1276,6 @@ void options(unsigned char *cp)
                        x0213_f = TRUE;
 #ifdef SHIFTJIS_CP932
                        cp51932_f = FALSE;
-                       cp932inv_f = FALSE;
 #endif
 #ifdef UTF8_INPUT_ENABLE
                    }else if(strcmp(codeset, "UTF-8") == 0 ||
@@ -1324,7 +1322,7 @@ void options(unsigned char *cp)
                        output_conv = j_oconv;
                        no_cp932ext_f = TRUE;
 #ifdef SHIFTJIS_CP932
-                       cp51932_f = TRUE;
+                       cp932inv_f = FALSE;
 #endif
 #ifdef UTF8_OUTPUT_ENABLE
                        ms_ucs_map_f = UCS_MAP_CP932;
@@ -1333,7 +1331,7 @@ void options(unsigned char *cp)
                        output_conv = j_oconv;
                        x0201_f = TRUE;
 #ifdef SHIFTJIS_CP932
-                       cp51932_f = TRUE;
+                       cp932inv_f = FALSE;
 #endif
 #ifdef UTF8_OUTPUT_ENABLE
                        ms_ucs_map_f = UCS_MAP_CP932;
@@ -1341,7 +1339,7 @@ void options(unsigned char *cp)
                    }else if(strcmp(codeset, "CP50221") == 0){
                        output_conv = j_oconv;
 #ifdef SHIFTJIS_CP932
-                       cp51932_f = TRUE;
+                       cp932inv_f = FALSE;
 #endif
 #ifdef UTF8_OUTPUT_ENABLE
                        ms_ucs_map_f = UCS_MAP_CP932;
@@ -1352,7 +1350,7 @@ void options(unsigned char *cp)
                        x0212_f = TRUE;
 #endif
 #ifdef SHIFTJIS_CP932
-                       cp51932_f = FALSE;
+                       cp932inv_f = FALSE;
 #endif
                    }else if(strcmp(codeset, "ISO-2022-JP-3") == 0){
                        output_conv = j_oconv;
@@ -1361,7 +1359,7 @@ void options(unsigned char *cp)
 #endif
                        x0213_f = TRUE;
 #ifdef SHIFTJIS_CP932
-                       cp51932_f = FALSE;
+                       cp932inv_f = FALSE;
 #endif
                    }else if(strcmp(codeset, "SHIFT_JIS") == 0){
                        output_conv = s_oconv;
@@ -1370,10 +1368,6 @@ void options(unsigned char *cp)
                             strcmp(codeset, "CP932") == 0 ||
                             strcmp(codeset, "MS932") == 0){
                        output_conv = s_oconv;
-#ifdef SHIFTJIS_CP932
-                       cp51932_f = TRUE;
-                       cp932inv_f = TRUE;
-#endif
 #ifdef UTF8_OUTPUT_ENABLE
                        ms_ucs_map_f = UCS_MAP_CP932;
 #endif
@@ -1383,7 +1377,7 @@ void options(unsigned char *cp)
                    }else if(strcmp(codeset, "CP51932") == 0){
                        output_conv = e_oconv;
 #ifdef SHIFTJIS_CP932
-                       cp51932_f = TRUE;
+                       cp932inv_f = FALSE;
 #endif
 #ifdef UTF8_OUTPUT_ENABLE
                        ms_ucs_map_f = UCS_MAP_CP932;
@@ -1395,9 +1389,6 @@ void options(unsigned char *cp)
 #ifdef X0212_ENABLE
                        x0212_f = TRUE;
 #endif
-#ifdef SHIFTJIS_CP932
-                       cp51932_f = FALSE;
-#endif
 #ifdef UTF8_OUTPUT_ENABLE
                        ms_ucs_map_f = UCS_MAP_MS;
 #endif
@@ -1407,9 +1398,6 @@ void options(unsigned char *cp)
 #ifdef X0212_ENABLE
                        x0212_f = TRUE;
 #endif
-#ifdef SHIFTJIS_CP932
-                       cp51932_f = FALSE;
-#endif
 #ifdef UTF8_OUTPUT_ENABLE
                        ms_ucs_map_f = UCS_MAP_ASCII;
 #endif
@@ -1428,7 +1416,7 @@ void options(unsigned char *cp)
 #endif
                        x0213_f = TRUE;
 #ifdef SHIFTJIS_CP932
-                       cp51932_f = FALSE;
+                       cp932inv_f = FALSE;
 #endif
 #ifdef UTF8_OUTPUT_ENABLE
                    }else if(strcmp(codeset, "UTF-8") == 0){
@@ -1682,6 +1670,7 @@ void options(unsigned char *cp)
             continue;
         case 'e':           /* AT&T EUC output */
             output_conv = e_oconv;
+            cp932inv_f = FALSE;
             continue;
         case 's':           /* SJIS output */
             output_conv = s_oconv;
@@ -2561,7 +2550,7 @@ nkf_char kanji_convert(FILE *f)
            code_status(c1);
         if (c2) {
             /* second byte */
-            if (c2 > DEL) {
+            if (c2 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
                 /* in case of 8th bit is on */
                 if (!estab_f&&!mime_decode_mode) {
                     /* in case of not established yet */
@@ -2573,7 +2562,7 @@ nkf_char kanji_convert(FILE *f)
                     NEXT;
                 } else {
                    /* in case of already established */
-                   if (c1 < AT && !(X0208 && 0x80 <= c2 && c2 <= 0x92)) {
+                   if (c1 < AT) {
                        /* ignore bogus code and not CP5022x UCD */
                        c2 = 0;
                        NEXT;
@@ -2650,7 +2639,7 @@ nkf_char kanji_convert(FILE *f)
                 SEND;
            } else
 #endif
-           if (c1 > DEL) {
+           if (c1 > ((input_f == JIS_INPUT && ms_ucs_map_f) ? 0x92 : DEL)) {
                 /* 8 bit code */
                 if (!estab_f && !iso8859_f) {
                     /* not established yet */
@@ -2958,7 +2947,7 @@ nkf_char kanji_convert(FILE *f)
            break;
        case X0208:
        case X0213_1:
-           if (ms_ucs_map_f == UCS_MAP_CP932 &&
+           if (ms_ucs_map_f &&
                0x7F <= c2 && c2 <= 0x92 &&
                0x21 <= c1 && c1 <= 0x7E) {
                /* CP932 UDC */
@@ -3135,7 +3124,7 @@ nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
 #endif
     static const nkf_char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
 #ifdef SHIFTJIS_CP932
-    if (cp51932_f && is_ibmext_in_sjis(c2)){
+    if (!cp932inv_f && is_ibmext_in_sjis(c2)){
 #if 0
         extern const unsigned short shiftjis_cp932[3][189];
 #endif
@@ -3145,6 +3134,17 @@ nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
             c1 = val & 0xff;
         }
     }
+    if (cp932inv_f
+        && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
+#if 0
+        extern const unsigned short cp932inv[2][189];
+#endif
+        nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
+        if (c){
+            c2 = c >> 8;
+            c1 = c & 0xff;
+        }
+    }
 #endif /* SHIFTJIS_CP932 */
 #ifdef X0212_ENABLE
     if (!x0213_f && is_ibmext_in_sjis(c2)){
@@ -3154,7 +3154,7 @@ nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
         val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
         if (val){
             if (val > 0x7FFF){
-                c2 = PREFIX_EUCG3 | (val >> 8);
+                c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
                 c1 = val & 0xff;
             }else{
                 c2 = val >> 8;
@@ -3199,9 +3199,7 @@ nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
        c1 &= 0x7f;
     } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
         /* NOP */
-    } else if (ms_ucs_map_f == UCS_MAP_CP932 &&
-              0xF0 <= c2 && c2 <= 0xF9 &&
-              0x40 <= c1 && c1 <= 0xFC) {
+    } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
        /* CP932 UDC */
        if(c1 == 0x7F) return 0;
        c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
@@ -3223,7 +3221,7 @@ nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
         if (c0 == 0){
             return -1;
         }
-       if (ms_ucs_map_f == UCS_MAP_MS && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
+       if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
            /* encoding is eucJP-ms, so invert to Unicode Private User Area */
            c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
            c2 = 0;
@@ -3250,13 +3248,25 @@ nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
     } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
         /* NOP */
     } else {
-       if (ms_ucs_map_f == UCS_MAP_MS && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
+       if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
            /* encoding is eucJP-ms, so invert to Unicode Private User Area */
            c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
            c2 = 0;
        } else {
            c1 &= 0x7f;
            c2 &= 0x7f;
+#ifdef SHIFTJIS_CP932
+           if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
+               nkf_char s2, s1;
+               if (e2s_conv(c2, c1, &s2, &s1) == 0){
+                   s2e_conv(s2, s1, &c2, &c1);
+                   if (c2 < 0x100){
+                       c1 &= 0x7f;
+                       c2 &= 0x7f;
+                   }
+               }
+           }
+#endif /* SHIFTJIS_CP932 */
         }
     }
     (*oconv)(c2, c1);
@@ -3517,7 +3527,7 @@ nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *
                    if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
                    break;
                }
-           }else if(cp51932_f){
+           }else if(!cp932inv_f){
                switch(c2){
                case 0xC2:
                    if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
@@ -3573,7 +3583,7 @@ nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *
                        if(c0 == 0x8D) return 1;
                        break;
                    case 0xBD:
-                       if(c0 == 0x9E && cp51932_f) return 1;
+                       if(c0 == 0x9E && !cp932inv_f) return 1;
                        break;
                    case 0xBF:
                        if(0xA0 <= c0 && c0 <= 0xA5) return 1;
@@ -3590,7 +3600,7 @@ nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *
        ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
     }else return -1;
 #ifdef SHIFTJIS_CP932
-    if (!ret && cp51932_f && is_eucg3(*p2)) {
+    if (!ret && !cp932inv_f && is_eucg3(*p2)) {
        nkf_char s2, s1;
        if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
            s2e_conv(s2, s1, p2, p1);
@@ -3938,13 +3948,21 @@ void e_oconv(nkf_char c2, nkf_char c1)
         w16e_conv(c1, &c2, &c1);
         if (c2 == 0 && is_unicode_capsule(c1)){
            c2 = c1 & VALUE_MASK;
-           if (ms_ucs_map_f == UCS_MAP_MS &&
-               0xE000 <= c2 && c2 <= 0xE757) {
+           if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
                /* eucJP-ms UDC */
                c1 &= 0xFFF;
                c2 = c1 / 94;
                c2 += c2 < 10 ? 0x75 : 0x8FEB;
                c1 = 0x21 + c1 % 94;
+               if (is_eucg3(c2)){
+                   (*o_putc)(0x8f);
+                   (*o_putc)((c2 & 0x7f) | 0x080);
+                   (*o_putc)(c1 | 0x080);
+               }else{
+                   (*o_putc)((c2 & 0x7f) | 0x080);
+                   (*o_putc)(c1 | 0x080);
+               }
+               return;
            } else {
                if (encode_fallback) (*encode_fallback)(c1);
                return;
@@ -3968,7 +3986,7 @@ void e_oconv(nkf_char c2, nkf_char c1)
     } else if (is_eucg3(c2)){
        output_mode = JAPANESE_EUC;
 #ifdef SHIFTJIS_CP932
-        if (cp51932_f){
+        if (!cp932inv_f){
             nkf_char s2, s1;
             if (e2s_conv(c2, c1, &s2, &s1) == 0){
                 s2e_conv(s2, s1, &c2, &c1);
@@ -4034,7 +4052,7 @@ nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
 {
     nkf_char ndx;
     if (is_eucg3(c2)){
-       ndx = c2 & 0xff;
+       ndx = c2 & 0x7f;
        if (x0213_f){
            if((0x21 <= ndx && ndx <= 0x2F)){
                if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
@@ -4082,8 +4100,7 @@ void s_oconv(nkf_char c2, nkf_char c1)
         w16e_conv(c1, &c2, &c1);
         if (c2 == 0 && is_unicode_capsule(c1)){
            c2 = c1 & VALUE_MASK;
-           if (ms_ucs_map_f == UCS_MAP_CP932 &&
-               0xE000 <= c2 && c2 <= 0xE757) {
+           if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
                /* CP932 UDC */
                c1 &= 0xFFF;
                c2 = c1 / 188 + 0xF0;
@@ -4156,8 +4173,7 @@ void j_oconv(nkf_char c2, nkf_char c1)
         w16e_conv(c1, &c2, &c1);
         if (c2 == 0 && is_unicode_capsule(c1)){
            c2 = c1 & VALUE_MASK;
-           if (ms_ucs_map_f == UCS_MAP_CP932 &&
-               0xE000 <= c2 && c2 <= 0xE757) {
+           if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
                /* CP5022x UDC */
                c1 &= 0xFFF;
                c2 = 0x7F + c1 / 94;
@@ -4222,7 +4238,7 @@ void j_oconv(nkf_char c2, nkf_char c1)
         }
         (*o_putc)(c1);
     } else {
-       if(ms_ucs_map_f == UCS_MAP_CP932
+       if(ms_ucs_map_f
           ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
           : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
        if(x0213_f){
index 9e59956..e43ad55 100644 (file)
--- a/utf8tbl.c
+++ b/utf8tbl.c
@@ -7544,7 +7544,7 @@ const unsigned short cp932inv[2][189] = {
   0xFBFC, 0xFC40, 0xFC41, 0xFC42, 0xFC43, 0xFC44, 0xFC45, 0xFC46,
   0xFC47, 0xFC48, 0xFC49, 0xFC4A, 0xFC4B,      0,      0, 0xFA40,
   0xFA41, 0xFA42, 0xFA43, 0xFA44, 0xFA45, 0xFA46, 0xFA47, 0xFA48,
-  0xFA49,      0, 0xFA55, 0xFA56, 0xFA57,
+  0xFA49, 0x81CA, 0xFA55, 0xFA56, 0xFA57,
  },
 };
 #endif /* SHIFTJIS_CP932 */