OSDN Git Service

* Fix: can't output Shift_JIS UDC.
[nkf/nkf.git] / nkf.c
diff --git a/nkf.c b/nkf.c
index 565baae..8a4b71a 100644 (file)
--- a/nkf.c
+++ b/nkf.c
@@ -39,9 +39,9 @@
 **        E-Mail: furukawa@tcp-ip.or.jp
 **    \e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#\e(B
 ***********************************************************************/
-/* $Id: nkf.c,v 1.108 2006/09/15 08:06:14 naruse Exp $ */
+/* $Id: nkf.c,v 1.117 2006/11/04 14:35:25 naruse Exp $ */
 #define NKF_VERSION "2.0.8"
-#define NKF_RELEASE_DATE "2006-09-15"
+#define NKF_RELEASE_DATE "2006-11-04"
 #include "config.h"
 #include "utf8tbl.h"
 
@@ -581,8 +581,8 @@ struct input_code input_code_list[] = {
     {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
 #ifdef UTF8_INPUT_ENABLE
     {"UTF-8",     0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
-    {"UTF-16",    0, 0, 0, {0, 0, 0},     NULL, w_iconv16, 0},\r
-    {"UTF-32",    0, 0, 0, {0, 0, 0},     NULL, w_iconv32, 0},\r
+    {"UTF-16",    0, 0, 0, {0, 0, 0},     NULL, w_iconv16, 0},
+    {"UTF-32",    0, 0, 0, {0, 0, 0},     NULL, w_iconv32, 0},
 #endif
     {0}
 };
@@ -854,6 +854,7 @@ int main(int argc, char **argv)
       }
     } else {
       int nfiles = argc;
+       int is_argument_error = FALSE;
       while (argc--) {
            is_inputcode_mixed = FALSE;
            is_inputcode_set   = FALSE;
@@ -863,7 +864,9 @@ int main(int argc, char **argv)
 #endif
           if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
               perror(*--argv);
-              return(-1);
+               *argv++;
+               is_argument_error = TRUE;
+               continue;
           } else {
 #ifdef OVERWRITE
               int fd = 0;
@@ -1011,6 +1014,8 @@ int main(int argc, char **argv)
 #endif
           }
       }
+       if (is_argument_error)
+           return(-1);
     }
 #ifdef EASYWIN /*Easy Win */
     if (file_out_f == FALSE) 
@@ -1191,13 +1196,19 @@ void options(unsigned char *cp)
                        codeset[i] = nkf_toupper(p[i]);
                    }
                    codeset[i] = 0;
-                   if(strcmp(codeset, "ISO-2022-JP") == 0 ||
-                     strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
+                   if(strcmp(codeset, "ISO-2022-JP") == 0){
+                       input_f = JIS_INPUT;
+                   }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0 ||
                      strcmp(codeset, "CP50220") == 0 ||
                      strcmp(codeset, "CP50221") == 0 ||
-                     strcmp(codeset, "CP50222") == 0 ||
-                     strcmp(codeset, "ISO-2022-JP-MS") == 0){
+                     strcmp(codeset, "CP50222") == 0){
                        input_f = JIS_INPUT;
+#ifdef SHIFTJIS_CP932
+                       cp51932_f = TRUE;
+#endif
+#ifdef UTF8_OUTPUT_ENABLE
+                       ms_ucs_map_f = UCS_MAP_CP932;
+#endif
                    }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
                        input_f = JIS_INPUT;
 #ifdef X0212_ENABLE
@@ -1211,13 +1222,11 @@ void options(unsigned char *cp)
                        x0213_f = TRUE;
                    }else if(strcmp(codeset, "SHIFT_JIS") == 0){
                        input_f = SJIS_INPUT;
-                       if (x0201_f==NO_X0201) x0201_f=TRUE;
                    }else if(strcmp(codeset, "WINDOWS-31J") == 0 ||
                             strcmp(codeset, "CSWINDOWS31J") == 0 ||
                             strcmp(codeset, "CP932") == 0 ||
                             strcmp(codeset, "MS932") == 0){
                        input_f = SJIS_INPUT;
-                       x0201_f = FALSE;
 #ifdef SHIFTJIS_CP932
                        cp51932_f = TRUE;
 #endif
@@ -1229,7 +1238,6 @@ void options(unsigned char *cp)
                        input_f = EUC_INPUT;
                    }else if(strcmp(codeset, "CP51932") == 0){
                        input_f = EUC_INPUT;
-                       x0201_f = FALSE;
 #ifdef SHIFTJIS_CP932
                        cp51932_f = TRUE;
 #endif
@@ -1240,7 +1248,6 @@ void options(unsigned char *cp)
                             strcmp(codeset, "EUCJP-MS") == 0 ||
                             strcmp(codeset, "EUCJPMS") == 0){
                        input_f = EUC_INPUT;
-                       x0201_f = FALSE;
 #ifdef SHIFTJIS_CP932
                        cp51932_f = FALSE;
 #endif
@@ -1250,7 +1257,6 @@ void options(unsigned char *cp)
                    }else if(strcmp(codeset, "EUC-JP-ASCII") == 0 ||
                             strcmp(codeset, "EUCJP-ASCII") == 0){
                        input_f = EUC_INPUT;
-                       x0201_f = FALSE;
 #ifdef SHIFTJIS_CP932
                        cp51932_f = FALSE;
 #endif
@@ -1265,11 +1271,9 @@ void options(unsigned char *cp)
                        cp51932_f = FALSE;
                        cp932inv_f = FALSE;
 #endif
-                       if (x0201_f==NO_X0201) x0201_f=TRUE;
                    }else if(strcmp(codeset, "EUC-JISX0213") == 0 ||
                             strcmp(codeset, "EUC-JIS-2004") == 0){
                        input_f = EUC_INPUT;
-                       x0201_f = FALSE;
                        x0213_f = TRUE;
 #ifdef SHIFTJIS_CP932
                        cp51932_f = FALSE;
@@ -1313,16 +1317,35 @@ void options(unsigned char *cp)
                        codeset[i] = nkf_toupper(p[i]);
                    }
                    codeset[i] = 0;
-                   if(strcmp(codeset, "ISO-2022-JP") == 0 ||
-                      strcmp(codeset, "CP50220") == 0){
+                   if(strcmp(codeset, "ISO-2022-JP") == 0){
                        output_conv = j_oconv;
                    }else if(strcmp(codeset, "X-ISO2022JP-CP932") == 0){
                        output_conv = j_oconv;
+                       x0201_f = FALSE;
                        no_cp932ext_f = TRUE;
-                   }else if(strcmp(codeset, "CP50221") == 0 ||
-                            strcmp(codeset, "ISO-2022-JP-MS") == 0){
+#ifdef SHIFTJIS_CP932
+                       cp51932_f = TRUE;
+#endif
+#ifdef UTF8_OUTPUT_ENABLE
+                       ms_ucs_map_f = UCS_MAP_CP932;
+#endif
+                   }else if(strcmp(codeset, "CP50220") == 0){
+                       output_conv = j_oconv;
+#ifdef SHIFTJIS_CP932
+                       cp51932_f = TRUE;
+#endif
+#ifdef UTF8_OUTPUT_ENABLE
+                       ms_ucs_map_f = UCS_MAP_CP932;
+#endif
+                   }else if(strcmp(codeset, "CP50221") == 0){
                        output_conv = j_oconv;
                        x0201_f = FALSE;
+#ifdef SHIFTJIS_CP932
+                       cp51932_f = TRUE;
+#endif
+#ifdef UTF8_OUTPUT_ENABLE
+                       ms_ucs_map_f = UCS_MAP_CP932;
+#endif
                    }else if(strcmp(codeset, "ISO-2022-JP-1") == 0){
                        output_conv = j_oconv;
 #ifdef X0212_ENABLE
@@ -2233,10 +2256,10 @@ void code_status(nkf_char c)
     struct input_code *result = 0;
     struct input_code *p = input_code_list;
     while (p->name){
-        if (!p->status_func) {\r
-           ++p;\r
-           continue;\r
-       }\r
+        if (!p->status_func) {
+           ++p;
+           continue;
+       }
         if (!p->status_func)
            continue;
         (p->status_func)(p, c);
@@ -2419,8 +2442,11 @@ void check_bom(FILE *f)
                    if(!input_f){
                        set_iconv(TRUE, w_iconv32);
                    }
-                   input_endian = ENDIAN_BIG;
-                   return;
+                   if (iconv == w_iconv32) {
+                       input_endian = ENDIAN_BIG;
+                       return;
+                   }
+                   (*i_ungetc)(0xFF,f);
                }else (*i_ungetc)(c2,f);
                (*i_ungetc)(0xFE,f);
            }else if(c2 == 0xFF){
@@ -2428,8 +2454,11 @@ void check_bom(FILE *f)
                    if(!input_f){
                        set_iconv(TRUE, w_iconv32);
                    }
-                   input_endian = ENDIAN_2143;
-                   return;
+                   if (iconv == w_iconv32) {
+                       input_endian = ENDIAN_2143;
+                       return;
+                   }
+                   (*i_ungetc)(0xFF,f);
                }else (*i_ungetc)(c2,f);
                (*i_ungetc)(0xFF,f);
            }else (*i_ungetc)(c2,f);
@@ -2443,7 +2472,10 @@ void check_bom(FILE *f)
                if(!input_f){
                    set_iconv(TRUE, w_iconv);
                }
-               return;
+               if (iconv == w_iconv) {
+                   return;
+               }
+               (*i_ungetc)(0xBF,f);
            }else (*i_ungetc)(c2,f);
            (*i_ungetc)(0xBB,f);
        }else (*i_ungetc)(c2,f);
@@ -2456,16 +2488,22 @@ void check_bom(FILE *f)
                    if(!input_f){
                        set_iconv(TRUE, w_iconv32);
                    }
-                   input_endian = ENDIAN_3412;
-                   return;
+                   if (iconv == w_iconv32) {
+                       input_endian = ENDIAN_3412;
+                       return;
+                   }
+                   (*i_ungetc)(0x00,f);
                }else (*i_ungetc)(c2,f);
                (*i_ungetc)(0x00,f);
            }else (*i_ungetc)(c2,f);
            if(!input_f){
                set_iconv(TRUE, w_iconv16);
            }
-           input_endian = ENDIAN_BIG;
-           return;
+           if (iconv == w_iconv16) {
+               input_endian = ENDIAN_BIG;
+               return;
+           }
+           (*i_ungetc)(0xFF,f);
        }else (*i_ungetc)(c2,f);
        (*i_ungetc)(0xFE,f);
        break;
@@ -2476,16 +2514,22 @@ void check_bom(FILE *f)
                    if(!input_f){
                        set_iconv(TRUE, w_iconv32);
                    }
-                   input_endian = ENDIAN_LITTLE;
-                   return;
+                   if (iconv == w_iconv32) {
+                       input_endian = ENDIAN_LITTLE;
+                       return;
+                   }
+                   (*i_ungetc)(0x00,f);
                }else (*i_ungetc)(c2,f);
                (*i_ungetc)(0x00,f);
            }else (*i_ungetc)(c2,f);
            if(!input_f){
                set_iconv(TRUE, w_iconv16);
            }
-           input_endian = ENDIAN_LITTLE;
-           return;
+           if (iconv == w_iconv16) {
+               input_endian = ENDIAN_LITTLE;
+               return;
+           }
+           (*i_ungetc)(0xFE,f);
        }else (*i_ungetc)(c2,f);
        (*i_ungetc)(0xFF,f);
        break;
@@ -2540,14 +2584,16 @@ nkf_char kanji_convert(FILE *f)
                     else 
                         c2 = 0;
                     NEXT;
-                } else
-                    /* in case of already established */
-                    if (c1 < AT) {
-                        /* ignore bogus code */
-                        c2 = 0;
-                        NEXT;
-                    } else
-                        SEND;
+                } else {
+                   /* in case of already established */
+                   if (c1 < AT && !(X0208 && 0x80 <= c2 && c2 <= 0x92)) {
+                       /* ignore bogus code, unless it is a CP5022x UDC */
+                       c2 = 0;
+                       NEXT;
+                   } else {
+                       SEND;
+                   }
+               }
             } else
                 /* second byte, 7 bit code */
                 /* it might be kanji shitfted */
@@ -2569,10 +2615,10 @@ nkf_char kanji_convert(FILE *f)
                                c0 <<= 8;
                                if ((c3 = (*i_getc)(f)) != EOF) {
                                    c0 |= c3;
-                               } else c1 = EOF;
-                           } else c1 = EOF;
+                               } else c2 = EOF;
+                           } else c2 = EOF;
                        }
-                   }
+                   } else c2 = EOF;
                } else {
                    if ((c2 = (*i_getc)(f)) != EOF) {
                        if (0xD8 <= c2 && c2 <= 0xDB) {
@@ -2580,10 +2626,10 @@ nkf_char kanji_convert(FILE *f)
                                if ((c0 = (*i_getc)(f)) != EOF) {
                                    c0 <<= 8;
                                    c0 |= c3;
-                               } else c1 = EOF;
-                           } else c1 = EOF;
+                               } else c2 = EOF;
+                           } else c2 = EOF;
                        }
-                   } else c1 = EOF;
+                   } else c2 = EOF;
                }
                SEND;
             } else if(iconv == w_iconv32){
@@ -2607,7 +2653,7 @@ nkf_char kanji_convert(FILE *f)
                    }
                    c2 = 0;
                }else{
-                   c1 = EOF;
+                   c2 = EOF;
                }
                SEND;
             } else
@@ -2825,6 +2871,44 @@ nkf_char kanji_convert(FILE *f)
                     (*oconv)(0, ESC);
                     SEND;
                 }
+           } else if (c1 == ESC && iconv == s_iconv) {
+               /* ESC in Shift_JIS */
+               if ((c1 = (*i_getc)(f)) == EOF) {
+                   /*  (*oconv)(0, ESC); don't send bogus code */
+                   LAST;
+               } else if (c1 == '$') {
+                   /* J-PHONE emoji */
+                   if ((c1 = (*i_getc)(f)) == EOF) {
+                       /*
+                          (*oconv)(0, ESC); don't send bogus code 
+                          (*oconv)(0, '$'); */
+                       LAST;
+                   } else {
+                       if (('E' <= c1 && c1 <= 'G') ||
+                           ('O' <= c1 && c1 <= 'Q')) {
+                           /*
+                              NUM : 0 1 2 3 4 5
+                              BYTE: G E F O P Q
+                              C%7 : 1 6 0 2 3 4
+                              C%7 : 0 1 2 3 4 5 6
+                              NUM : 2 0 3 4 5 X 1
+                            */
+                           static const int jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
+                           c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SPACE + 0xE000 + CLASS_UNICODE;
+                           while ((c1 = (*i_getc)(f)) != EOF) {
+                               if (SPACE <= c1 && c1 <= 'z') {
+                                   (*oconv)(0, c1 + c0);
+                               } else break; /* c1 == SO */
+                           }
+                       }
+                   }
+                   if (c1 == EOF) LAST;
+                   NEXT;
+               } else {
+                   /* lonely ESC  */
+                   (*oconv)(0, ESC);
+                   SEND;
+               }
             } else if ((c1 == NL || c1 == CR) && broken_f&4) {
                 input_mode = ASCII; set_iconv(FALSE, 0);
                 SEND;
@@ -2854,6 +2938,10 @@ nkf_char kanji_convert(FILE *f)
                }
                c1 = CR;
                SEND;
+           } else if (c1 == DEL && input_mode == X0208 ) {
+               /* CP5022x */
+               c2 = c1;
+               NEXT;
            } else 
                 SEND;
         }
@@ -2883,6 +2971,14 @@ nkf_char kanji_convert(FILE *f)
            break;
        case X0208:
        case X0213_1:
+           if (ms_ucs_map_f == UCS_MAP_CP932 &&
+               0x7F <= c2 && c2 <= 0x92 &&
+               0x21 <= c1 && c1 <= 0x7E) {
+               /* CP932 UDC */
+               if(c1 == 0x7F) return 0;
+               c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
+               c2 = 0;
+           }
            (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
            break;
 #ifdef X0212_ENABLE
@@ -3116,6 +3212,13 @@ nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
        c1 &= 0x7f;
     } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
         /* NOP */
+    } else if (ms_ucs_map_f == UCS_MAP_CP932 &&
+              0xF0 <= c2 && c2 <= 0xF9 &&
+              0x40 <= c1 && c1 <= 0xFC) {
+       /* CP932 UDC */
+       if(c1 == 0x7F) return 0;
+       c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
+       c2 = 0;
     } else {
         nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
         if (ret) return ret;
@@ -3133,20 +3236,26 @@ nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
         if (c0 == 0){
             return -1;
         }
-        c2 = (c2 << 8) | (c1 & 0x7f);
-        c1 = c0 & 0x7f;
+       if (ms_ucs_map_f == UCS_MAP_MS && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
+           /* encoding is eucJP-ms, so convert to the Unicode Private Use Area */
+           c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
+           c2 = 0;
+       } else {
+           c2 = (c2 << 8) | (c1 & 0x7f);
+           c1 = c0 & 0x7f;
 #ifdef SHIFTJIS_CP932
-        if (cp51932_f){
-            nkf_char s2, s1;
-            if (e2s_conv(c2, c1, &s2, &s1) == 0){
-                s2e_conv(s2, s1, &c2, &c1);
-                if (c2 < 0x100){
-                    c1 &= 0x7f;
-                    c2 &= 0x7f;
-                }
-            }
-        }
+           if (cp51932_f){
+               nkf_char s2, s1;
+               if (e2s_conv(c2, c1, &s2, &s1) == 0){
+                   s2e_conv(s2, s1, &c2, &c1);
+                   if (c2 < 0x100){
+                       c1 &= 0x7f;
+                       c2 &= 0x7f;
+                   }
+               }
+           }
 #endif /* SHIFTJIS_CP932 */
+        }
 #endif /* X0212_ENABLE */
     } else if (c2 == SSO){
         c2 = X0201;
@@ -3154,8 +3263,14 @@ nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
     } else if ((c2 == EOF) || (c2 == 0) || c2 < SPACE) {
         /* NOP */
     } else {
-        c1 &= 0x7f;
-        c2 &= 0x7f;
+       if (ms_ucs_map_f == UCS_MAP_MS && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
+           /* encoding is eucJP-ms, so convert to the Unicode Private Use Area */
+           c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
+           c2 = 0;
+       } else {
+           c1 &= 0x7f;
+           c2 &= 0x7f;
+        }
     }
     (*oconv)(c2, c1);
     return 0;
@@ -3772,6 +3887,7 @@ void w_oconv16(nkf_char c2, nkf_char c1)
         nkf_char val = e2w_conv(c2, c1);
         c2 = (val >> 8) & 0xff;
         c1 = val & 0xff;
+       if (!val) return;
     }
     if (output_endian == ENDIAN_LITTLE){
         (*o_putc)(c1);
@@ -3812,6 +3928,7 @@ void w_oconv32(nkf_char c2, nkf_char c1)
 #endif
     } else if (c2) {
         c1 = e2w_conv(c2, c1);
+       if (!c1) return;
     }
     if (output_endian == ENDIAN_LITTLE){
         (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
@@ -3833,8 +3950,18 @@ void e_oconv(nkf_char c2, nkf_char c1)
     if (c2 == 0 && is_unicode_capsule(c1)){
         w16e_conv(c1, &c2, &c1);
         if (c2 == 0 && is_unicode_capsule(c1)){
-           if(encode_fallback)(*encode_fallback)(c1);
-            return;
+           c2 = c1 & VALUE_MASK;
+           if (ms_ucs_map_f == UCS_MAP_MS &&
+               0xE000 <= c2 && c2 <= 0xE757) {
+               /* eucJP-ms UDC */
+               c1 &= 0xFFF;
+               c2 = c1 / 94;
+               c2 += c2 < 10 ? 0x75 : 0x8FEB;
+               c1 = 0x21 + c1 % 94;
+           } else {
+               if (encode_fallback) (*encode_fallback)(c1);
+               return;
+           }
         }
     }
 #endif
@@ -3967,9 +4094,22 @@ void s_oconv(nkf_char c2, nkf_char c1)
     if (c2 == 0 && is_unicode_capsule(c1)){
         w16e_conv(c1, &c2, &c1);
         if (c2 == 0 && is_unicode_capsule(c1)){
-           if(encode_fallback)(*encode_fallback)(c1);
-            return;
-        }
+           c2 = c1 & VALUE_MASK;
+           if (ms_ucs_map_f == UCS_MAP_CP932 &&
+               0xE000 <= c2 && c2 <= 0xE757) {
+               /* CP932 UDC */
+               c1 &= 0xFFF;
+               c2 = c1 / 188 + 0xF0;
+               c1 = c1 % 188;
+               c1 += 0x40 + (c1 > 0x3e);
+               (*o_putc)(c2);
+               (*o_putc)(c1);
+               return;
+           } else {
+               if(encode_fallback)(*encode_fallback)(c1);
+               return;
+           }
+       }
     }
 #endif
     if (c2 == EOF) {
@@ -4028,8 +4168,17 @@ void j_oconv(nkf_char c2, nkf_char c1)
     if (c2 == 0 && is_unicode_capsule(c1)){
         w16e_conv(c1, &c2, &c1);
         if (c2 == 0 && is_unicode_capsule(c1)){
-           if(encode_fallback)(*encode_fallback)(c1);
-            return;
+           c2 = c1 & VALUE_MASK;
+           if (ms_ucs_map_f == UCS_MAP_CP932 &&
+               0xE000 <= c2 && c2 <= 0xE757) {
+               /* CP5022x UDC */
+               c1 &= 0xFFF;
+               c2 = 0x7F + c1 / 94;
+               c1 = 0x21 + c1 % 94;
+           } else {
+               if (encode_fallback) (*encode_fallback)(c1);
+               return;
+           }
         }
     }
 #endif
@@ -4086,7 +4235,9 @@ void j_oconv(nkf_char c2, nkf_char c1)
         }
         (*o_putc)(c1);
     } else {
-       if(c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
+       if(ms_ucs_map_f == UCS_MAP_CP932
+          ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
+          : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
        if(x0213_f){
            if (output_mode!=X0213_1) {
                output_mode = X0213_1;