Fix: -Z1 and -Z2 don't include -Z0. [nkf-forum#41992]

[nkf/nkf.git] / nkf.c
diff --git a/nkf.c b/nkf.c

index 56da48c..35aacff 100644 (file)
--- a/nkf.c
+++ b/nkf.c
@@ -2,26 +2,26 @@
   * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
   * Copyright (c) 1996-2009, The nkf Project.
   *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ *
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ *
+ * 3. This notice may not be removed or altered from any source distribution.
   */
-#define NKF_VERSION "2.0.8"
-#define NKF_RELEASE_DATE "2009-01-05"
+#define NKF_VERSION "2.0.9"
+#define NKF_RELEASE_DATE "2009-02-20"
  #define COPY_RIGHT \
      "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
      "Copyright (C) 1996-2009, The nkf Project."
@@ -501,7 +501,7 @@ static nkf_char
  no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
  {
      fprintf(stderr,"nkf internal module connection failure.\n");
-    exit(1);
+    exit(EXIT_FAILURE);
      return 0; /* LINT */
  }
  
@@ -802,7 +802,7 @@ nkf_default_encoding()
  typedef struct {
      long capa;
      long len;
-    unsigned char *ptr;
+    nkf_char *ptr;
  } nkf_buf_t;
  
  static nkf_buf_t *
@@ -815,12 +815,14 @@ nkf_buf_new(int length)
      return buf;
  } 
  
+#if 0
  static void
  nkf_buf_dispose(nkf_buf_t *buf)
  {
      nkf_xfree(buf->ptr);
      nkf_xfree(buf);
  }
+#endif
  
  #define nkf_buf_length(buf) ((buf)->len)
  #define nkf_buf_empty_p(buf) ((buf)->len == 0)
@@ -839,7 +841,7 @@ nkf_buf_clear(nkf_buf_t *buf)
  }
  
  static void
-nkf_buf_push(nkf_buf_t *buf, unsigned char c)
+nkf_buf_push(nkf_buf_t *buf, nkf_char c)
  {
      if (buf->capa <= buf->len) {
         exit(EXIT_FAILURE);
@@ -870,79 +872,61 @@ static void
  usage(void)
  {
      fprintf(HELP_OUTPUT,
-           "USAGE:  nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
-           "Flags:\n"
-           "b,u      Output is buffered (DEFAULT),Output is unbuffered\n"
-           "j,s,e,w  Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
+           "Usage:  nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
  #ifdef UTF8_OUTPUT_ENABLE
-           "         After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
+           " j/s/e/w  Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
+           "          UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
+#else
  #endif
-           "J,S,E,W  Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
  #ifdef UTF8_INPUT_ENABLE
-           "         After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
+           " J/S/E/W  Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
+           "          UTF option is -W[8,[16,32][B,L]]\n"
+#else
+           " J/S/E    Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
  #endif
-           "t        no conversion\n"
             );
      fprintf(HELP_OUTPUT,
-           "i[@B]    Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
-           "o[BJH]   Specify the Esc Seq for ASCII/Roman        (DEFAULT B)\n"
-           "r        {de/en}crypt ROT13/47\n"
-           "h        1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
-           "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
-           "M[BQ]    MIME encode [B:base64 Q:quoted]\n"
-           "l        ISO8859-1 (Latin-1) support\n"
-           "f/F      Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
+           " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
+           " M[BQ]    MIME encode [B:base64 Q:quoted]\n"
+           " f/F      Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
             );
      fprintf(HELP_OUTPUT,
-           "Z[0-4]   Default/0: Convert JISX0208 Alphabet to ASCII\n"
-           "         1: Kankaku to one space  2: to two spaces  3: HTML Entity\n"
-           "         4: JISX0208 Katakana to JISX0201 Katakana\n"
-           "X,x      Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
-           "B[0-2]   Broken input  0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
+           " Z[0-4]   Default/0: Convert JISX0208 Alphabet to ASCII\n"
+           "          1: Kankaku to one space  2: to two spaces  3: HTML Entity\n"
+           "          4: JISX0208 Katakana to JISX0201 Katakana\n"
+           " X,x      Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
             );
      fprintf(HELP_OUTPUT,
-#ifdef MSDOS
-           "T        Text mode output\n"
-#endif
-           "O        Output to File (DEFAULT 'nkf.out')\n"
-           "I        Convert non ISO-2022-JP charactor to GETA\n"
-           "d,c      Convert line breaks  -d: LF  -c: CRLF\n"
-           "-L[uwm]  line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
-           "v, V     Show this usage. V: show configuration\n"
-           "\n");
+           " O        Output to File (DEFAULT 'nkf.out')\n"
+           " L[uwm]   Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
+           );
      fprintf(HELP_OUTPUT,
-           "Long name options\n"
-           " --ic=<input codeset>  --oc=<output codeset>\n"
-           "                   Specify the input or output codeset\n"
-           " --fj  --unix --mac  --windows\n"
-           " --jis  --euc  --sjis  --utf8  --utf16  --mime  --base64\n"
-           "                   Convert for the system or code\n"
-           " --hiragana  --katakana  --katakana-hiragana\n"
-           "                   To Hiragana/Katakana Conversion\n"
-           " --prefix=         Insert escape before troublesome characters of Shift_JIS\n"
+           " --ic=<encoding>        Specify the input encoding\n"
+           " --oc=<encoding>        Specify the output encoding\n"
+           " --hiragana --katakana  Hiragana/Katakana Conversion\n"
+           " --katakana-hiragana    Converts each other\n"
             );
      fprintf(HELP_OUTPUT,
  #ifdef INPUT_OPTION
-           " --cap-input, --url-input  Convert hex after ':' or '%%'\n"
+           " --{cap, url}-input     Convert hex after ':' or '%%'\n"
  #endif
  #ifdef NUMCHAR_OPTION
-           " --numchar-input   Convert Unicode Character Reference\n"
+           " --numchar-input        Convert Unicode Character Reference\n"
  #endif
  #ifdef UTF8_INPUT_ENABLE
             " --fb-{skip, html, xml, perl, java, subchar}\n"
-           "                   Specify how nkf handles unassigned characters\n"
+           "                        Specify unassigned character's replacement\n"
  #endif
             );
      fprintf(HELP_OUTPUT,
  #ifdef OVERWRITE
-           " --in-place[=SUFFIX]  --overwrite[=SUFFIX]\n"
-           "                   Overwrite original listed files by filtered result\n"
-           "                   --overwrite preserves timestamp of original files\n"
+           " --in-place[=SUF]       Overwrite original files\n"
+           " --overwrite[=SUF]      Preserve timestamp of original files\n"
  #endif
-           " -g  --guess       Guess the input code\n"
-           " --help  --version Show this help/the version\n"
-           "                   For more information, see also man nkf\n"
-           "\n");
+           " -g --guess             Guess the input code\n"
+           " -v --version           Print the version\n"
+           " --help/-V              Print this help / configuration\n"
+           );
      version();
  }
  
@@ -1140,7 +1124,7 @@ static const struct {
      {"euc","e"},
      {"euc-input","E"},
      {"fj","jm"},
-    {"help","v"},
+    {"help",""},
      {"jis","j"},
      {"jis-input","J"},
      {"mac","sLm"},
@@ -1150,7 +1134,7 @@ static const struct {
      {"sjis","s"},
      {"sjis-input","S"},
      {"unix","eLu"},
-    {"version","V"},
+    {"version","v"},
      {"windows","sLw"},
      {"hiragana","h1"},
      {"katakana","h2"},
@@ -2999,6 +2983,8 @@ typedef struct {
      nkf_buf_t *std_gc_buf;
      nkf_char broken_state;
      nkf_buf_t *broken_buf;
+    nkf_char mimeout_state;
+    nkf_buf_t *nfc_buf;
  } nkf_state_t;
  
  static nkf_state_t *nkf_state = NULL;
@@ -3010,15 +2996,17 @@ nkf_state_init(void)
  {
      if (nkf_state) {
         nkf_buf_clear(nkf_state->std_gc_buf);
-       nkf_state->broken_state = 0;
         nkf_buf_clear(nkf_state->broken_buf);
+       nkf_buf_clear(nkf_state->nfc_buf);
      }
      else {
         nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
         nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
-       nkf_state->broken_state = 0;
         nkf_state->broken_buf = nkf_buf_new(3);
+       nkf_state->nfc_buf = nkf_buf_new(9);
      }
+    nkf_state->broken_state = 0;
+    nkf_state->mimeout_state = 0;
  }
  
  #ifndef WIN32DLL
@@ -3427,9 +3415,7 @@ fold_conv(nkf_char c2, nkf_char c1)
         f_prev = LF;
         f_line = 0;
         fold_state =  LF;            /* output newline and clear */
-    } else if ( (c2==0  && c1==SP)||
-              (c2==0  && c1==TAB)||
-              (c2=='!'&& c1=='!')) {
+    } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
         /* X0208 kankaku or ascii space */
         if (f_prev == SP) {
             fold_state = 0;         /* remove duplicate spaces */
@@ -4272,14 +4258,14 @@ nfc_getc(FILE *f)
  {
      nkf_char (*g)(FILE *f) = i_nfc_getc;
      nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
-    nkf_buf_t *buf = nkf_buf_new(9);
+    nkf_buf_t *buf = nkf_state->nfc_buf;
      const unsigned char *array;
      int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
      nkf_char c = (*g)(f);
  
      if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;
  
-    nkf_buf_push(buf, (unsigned char)c);
+    nkf_buf_push(buf, c);
      do {
         while (lower <= upper) {
             int mid = (lower+upper) / 2;
@@ -4315,7 +4301,6 @@ nfc_getc(FILE *f)
  
      while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
      c = nkf_buf_pop(buf);
-    nkf_buf_dispose(buf);
  
      return c;
  }
@@ -4397,7 +4382,7 @@ mime_getc(FILE *f)
                 case LF:
                 case CR:
                     if (c1==LF) {
-                       if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
+                       if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
                             i_ungetc(SP,f);
                             continue;
                         } else {
@@ -4406,7 +4391,7 @@ mime_getc(FILE *f)
                         c1 = LF;
                     } else {
                         if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
-                           if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
+                           if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
                                 i_ungetc(SP,f);
                                 continue;
                             } else {
@@ -4496,7 +4481,7 @@ mime_getc(FILE *f)
             case LF:
             case CR:
                 if (c1==LF) {
-                   if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
+                   if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
                         i_ungetc(SP,f);
                         continue;
                     } else {
@@ -4508,7 +4493,7 @@ mime_getc(FILE *f)
                         if (c1==SP) {
                             i_ungetc(SP,f);
                             continue;
-                       } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
+                       } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
                             i_ungetc(SP,f);
                             continue;
                         } else {
@@ -4590,7 +4575,6 @@ static const char basis_64[] =
  static struct {
      char buf[MIMEOUT_BUF_LENGTH+1];
      int count;
-    nkf_char state;
  } mimeout_state;
  
  /*nkf_char mime_lastchar2, mime_lastchar1;*/
@@ -4618,15 +4602,12 @@ open_mime(nkf_char mode)
         PUT_NEWLINE((*o_mputc));
         (*o_mputc)(SP);
         base64_count = 1;
-       if (mimeout_state.count>0
-           && (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
-               || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)) {
+       if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
             i++;
         }
      }
      for (;i<mimeout_state.count;i++) {
-       if (mimeout_state.buf[i]==SP || mimeout_state.buf[i]==TAB
-           || mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF) {
+       if (nkf_isspace(mimeout_state.buf[i])) {
             (*o_mputc)(mimeout_state.buf[i]);
             base64_count ++;
         } else {
@@ -4694,13 +4675,13 @@ eof_mime(void)
      case 'B':
         break;
      case 2:
-       (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4)]);
+       (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
         (*o_mputc)('=');
         (*o_mputc)('=');
         base64_count += 3;
         break;
      case 1:
-       (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2)]);
+       (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
         (*o_mputc)('=');
         base64_count += 2;
         break;
@@ -4732,19 +4713,19 @@ mimeout_addchar(nkf_char c)
         }
         break;
      case 'B':
-       mimeout_state.state=c;
+       nkf_state->mimeout_state=c;
         (*o_mputc)(basis_64[c>>2]);
         mimeout_mode=2;
         base64_count ++;
         break;
      case 2:
-       (*o_mputc)(basis_64[((mimeout_state.state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
-       mimeout_state.state=c;
+       (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
+       nkf_state->mimeout_state=c;
         mimeout_mode=1;
         base64_count ++;
         break;
      case 1:
-       (*o_mputc)(basis_64[((mimeout_state.state & 0xF) << 2) | ((c & 0xC0) >>6)]);
+       (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
         (*o_mputc)(basis_64[c & 0x3F]);
         mimeout_mode='B';
         base64_count += 2;
@@ -4972,7 +4953,7 @@ mime_putc(nkf_char c)
                 mimeout_state.buf[mimeout_state.count++] = (char)c;
                 return;
             }
-           if (c==SP || c==TAB || c==CR || c==LF) {
+           if (nkf_isspace(c)) {
                 for (i=0;i<mimeout_state.count;i++) {
                     if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
                         eof_mime();
@@ -5863,6 +5844,12 @@ options(unsigned char *cp)
                 cp_back = cp;
                 cp = (unsigned char *)long_option[i].alias;
             }else{
+#ifndef PERL_XS
+               if (strcmp(long_option[i].name, "help") == 0){
+                   usage();
+                   exit(EXIT_SUCCESS);
+               }
+#endif
                 if (strcmp(long_option[i].name, "ic=") == 0){
                     enc = nkf_enc_find((char *)p);
                     if (!enc) continue;
@@ -6101,7 +6088,7 @@ options(unsigned char *cp)
             output_encoding = nkf_enc_from_index(EUCJP_NKF);
             continue;
         case 's':           /* SJIS output */
-           output_encoding = nkf_enc_from_index(WINDOWS_31J);
+           output_encoding = nkf_enc_from_index(SHIFT_JIS);
             continue;
         case 'l':           /* ISO8859 Latin-1 support, no conversion */
             iso8859_f = TRUE;  /* Only compatible with ISO-2022-JP */
@@ -6111,7 +6098,8 @@ options(unsigned char *cp)
             if (*cp=='@'||*cp=='B')
                 kanji_intro = *cp++;
             continue;
-       case 'o':           /* ASCII IN ESC-(-J/B */
+       case 'o':           /* ASCII IN ESC-(-J/B/H */
+           /* ESC ( H was used in initial JUNET messages */
             if (*cp=='J'||*cp=='B'||*cp=='H')
                 ascii_intro = *cp++;
             continue;
@@ -6136,11 +6124,11 @@ options(unsigned char *cp)
  #ifndef PERL_XS
         case 'V':
             show_configuration();
-           exit(1);
+           exit(EXIT_SUCCESS);
             break;
         case 'v':
-           usage();
-           exit(1);
+           version();
+           exit(EXIT_SUCCESS);
             break;
  #endif
  #ifdef UTF8_OUTPUT_ENABLE
@@ -6230,8 +6218,8 @@ options(unsigned char *cp)
         case 'E':   /* EUC-JP input */
             input_encoding = nkf_enc_from_index(EUCJP_NKF);
             continue;
-       case 'S':   /* Windows-31J input */
-           input_encoding = nkf_enc_from_index(WINDOWS_31J);
+       case 'S':   /* Shift_JIS input */
+           input_encoding = nkf_enc_from_index(SHIFT_JIS);
             continue;
         case 'Z':   /* Convert X0208 alphabet to asii */
             /* alpha_f
@@ -6244,6 +6232,7 @@ options(unsigned char *cp)
             while ('0'<= *cp && *cp <='9') {
                 alpha_f |= 1 << (*cp++ - '0');
             }
+           if (alpha_f & ((1 << 2) | (1 << 3))) alpha_f |= 1;
             if (!alpha_f) alpha_f = 1;
             continue;
         case 'x':   /* Convert X0201 kana to X0208 or X0201 Conversion */