X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=nkf.c;h=6877afe2374925c01f6ea480989933506c006daa;hb=71012057801fe0cdcf0ab9a136c7259e5409ca0e;hp=e907c65e5ad1698ff1366600ce9f24f2c4c707c5;hpb=1a4e12daeafd2e07084195f4a6bfd4409cc88d1e;p=nkf%2Fnkf.git diff --git a/nkf.c b/nkf.c index e907c65..6877afe 100644 --- a/nkf.c +++ b/nkf.c @@ -1,209 +1,62 @@ -/** Network Kanji Filter. (PDS Version) -************************************************************************ -** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA) -** 連絡先: (株)富士通研究所 ソフト3研 市川 至 -** (E-Mail Address: ichikawa@flab.fujitsu.co.jp) -** Copyright (C) 1996,1998 -** Copyright (C) 2002 -** 連絡先: 琉球大学情報工学科 河野 真治 mime/X0208 support -** (E-Mail Address: kono@ie.u-ryukyu.ac.jp) -** 連絡先: COW for DOS & Win16 & Win32 & OS/2 -** (E-Mail Address: GHG00637@niftyserve.or.p) -** -** このソースのいかなる複写,改変,修正も許諾します。ただし、 -** その際には、誰が貢献したを示すこの部分を残すこと。 -** 再配布や雑誌の付録などの問い合わせも必要ありません。 -** 営利利用も上記に反しない範囲で許可します。 -** バイナリの配布の際にはversion messageを保存することを条件とします。 -** このプログラムについては特に何の保証もしない、悪しからず。 -** -** Everyone is permitted to do anything on this program -** including copying, modifying, improving, -** as long as you don't try to pretend that you wrote it. -** i.e., the above copyright notice has to appear in all copies. -** Binary distribution requires original version messages. -** You don't have to ask before copying, redistribution or publishing. -** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE. -***********************************************************************/ - -/*********************************************************************** - * 現在、nkf は SorceForge にてメンテナンスが続けられています。 - * http://sourceforge.jp/projects/nkf/ -***********************************************************************/ -/* $Id: nkf.c,v 1.162 2008/01/01 14:21:20 naruse Exp $ */ -#define NKF_VERSION "2.0.8" -#define NKF_RELEASE_DATE "2007-01-02" +/* + * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA). + * Copyright (c) 1996-2010, The nkf Project. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * + * 3. This notice may not be removed or altered from any source distribution. + */ +#define NKF_VERSION "2.1.1" +#define NKF_RELEASE_DATE "2010-08-08" #define COPY_RIGHT \ - "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \ - "Copyright (C) 2002-2007 Kono, Furukawa, Naruse, mastodon" + "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \ + "Copyright (C) 1996-2010, The nkf Project." #include "config.h" +#include "nkf.h" #include "utf8tbl.h" - -#if defined(DEFAULT_CODE_JIS) -#elif defined(DEFAULT_CODE_SJIS) -#elif defined(DEFAULT_CODE_EUC) -#elif defined(DEFAULT_CODE_UTF8) -#else -#define DEFAULT_CODE_JIS 1 -#endif - -#ifndef MIME_DECODE_DEFAULT -#define MIME_DECODE_DEFAULT STRICT_MIME -#endif -#ifndef X0201_DEFAULT -#define X0201_DEFAULT TRUE -#endif - -#if DEFAULT_NEWLINE == 0x0D0A -#define PUT_NEWLINE(func) do {\ - func(0x0D);\ - func(0x0A);\ -} while (0) -#define OCONV_NEWLINE(func) do {\ - func(0, 0x0D);\ - func(0, 0x0A);\ -} while (0) -#elif DEFAULT_NEWLINE == 0x0D -#define PUT_NEWLINE(func) func(0x0D) -#define OCONV_NEWLINE(func) func(0, 0x0D) -#else -#define DEFAULT_NEWLINE 0x0A -#define PUT_NEWLINE(func) func(0x0A) -#define OCONV_NEWLINE(func) func(0, 0x0A) -#endif -#ifdef HELP_OUTPUT_STDERR -#define HELP_OUTPUT stderr -#else -#define HELP_OUTPUT stdout -#endif - -#if (defined(__TURBOC__) || defined(_MSC_VER) || defined(LSI_C) || defined(__MINGW32__) || defined(__EMX__) || defined(__MSDOS__) || defined(__WINDOWS__) || defined(__DOS__) || defined(__OS2__)) && !defined(MSDOS) -#define MSDOS -#if (defined(__Win32__) || defined(_WIN32)) && !defined(__WIN32__) -#define __WIN32__ -#endif -#endif - -#ifdef PERL_XS -#undef OVERWRITE -#endif - -#ifndef PERL_XS -#include -#endif - -#include -#include - -#if defined(MSDOS) || defined(__OS2__) -#include -#include -#if defined(_MSC_VER) || defined(__WATCOMC__) -#define mktemp _mktemp -#endif -#endif - -#ifdef MSDOS -#ifdef LSI_C -#define setbinmode(fp) fsetbin(fp) -#elif defined(__DJGPP__) -#include -#define setbinmode(fp) djgpp_setbinmode(fp) -#else /* Microsoft C, Turbo C */ -#define setbinmode(fp) setmode(fileno(fp), O_BINARY) -#endif -#else /* UNIX */ -#define setbinmode(fp) -#endif - -#if defined(__DJGPP__) -void djgpp_setbinmode(FILE *fp) -{ - /* we do not use libc's setmode(), which changes COOKED/RAW mode in device. */ - int fd, m; - fd = fileno(fp); - m = (__file_handle_modes[fd] & (~O_TEXT)) | O_BINARY; - __file_handle_set(fd, m); -} -#endif - -#ifdef _IOFBF /* SysV and MSDOS, Windows */ -#define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size) -#else /* BSD */ -#define setvbuffer(fp, buf, size) setbuffer(fp, buf, size) -#endif - -/*Borland C++ 4.5 EasyWin*/ -#if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */ -#define EASYWIN -#ifndef __WIN16__ -#define __WIN16__ -#endif -#include -#endif - -#ifdef OVERWRITE -/* added by satoru@isoternet.org */ -#if defined(__EMX__) -#include -#endif -#include -#if !defined(MSDOS) || defined(__DJGPP__) /* UNIX, djgpp */ -#include -#if defined(__WATCOMC__) -#include -#else -#include -#endif -#else /* defined(MSDOS) */ #ifdef __WIN32__ -#ifdef __BORLANDC__ /* BCC32 */ -#include -#else /* !defined(__BORLANDC__) */ -#include -#endif /* (__BORLANDC__) */ -#else /* !defined(__WIN32__) */ -#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__WATCOMC__) || defined(__OS2__) || defined(__EMX__) || defined(__IBMC__) || defined(__IBMCPP__) /* VC++, MinGW, Watcom, emx+gcc, IBM VAC++ */ -#include -#elif defined(__TURBOC__) /* BCC */ -#include -#elif defined(LSI_C) /* LSI C */ -#endif /* (__WIN32__) */ -#endif +#include +#include #endif +#if defined(__OS2__) +# define INCL_DOS +# define INCL_DOSERRORS +# include #endif +#include -#define FALSE 0 -#define TRUE 1 /* state of output_mode and input_mode c2 0 means ASCII - JIS_X_0201 - ISO_8859_1 - JIS_X_0208 - EOF all termination + JIS_X_0201_1976_K + ISO_8859_1 + JIS_X_0208 + EOF all termination c1 32bit data */ -/* Input Assumption */ - -#define JIS_INPUT 4 -#define EUC_INPUT 16 -#define SJIS_INPUT 5 -#define LATIN1_INPUT 6 -#define UTF8_INPUT 13 -#define UTF16_INPUT 1015 -#define UTF32_INPUT 1017 +/* MIME ENCODE */ #define FIXED_MIME 7 #define STRICT_MIME 8 -/* MIME ENCODE */ - - /* byte order */ enum byte_order { ENDIAN_BIG = 1, @@ -220,12 +73,10 @@ enum byte_order { #define CR 0x0d #define ESC 0x1b #define SP 0x20 -#define AT 0x40 -#define SSP 0xa0 #define DEL 0x7f #define SI 0x0f #define SO 0x0e -#define SSO 0x8e +#define SS2 0x8e #define SS3 0x8f #define CRLF 0x0D0A @@ -241,10 +92,12 @@ enum nkf_encodings { CP50222, ISO_2022_JP_1, ISO_2022_JP_3, + ISO_2022_JP_2004, SHIFT_JIS, WINDOWS_31J, CP10001, EUC_JP, + EUCJP_NKF, CP51932, EUCJP_MS, EUCJP_ASCII, @@ -266,25 +119,30 @@ enum nkf_encodings { UTF_32BE_BOM, UTF_32LE, UTF_32LE_BOM, - JIS_X_0201=0x1000, - JIS_X_0208=0x1001, - JIS_X_0212=0x1002, - JIS_X_0213_1=0x1003, - JIS_X_0213_2=0x1004, - BINARY + BINARY, + NKF_ENCODING_TABLE_SIZE, + JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */ + /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */ + /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */ + /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */ + JIS_X_0208 = 0x1168, /* @B */ + JIS_X_0212 = 0x1159, /* D */ + /* JIS_X_0213_2000_1 = 0x1228, */ /* O */ + JIS_X_0213_2 = 0x1229, /* P */ + JIS_X_0213_1 = 0x1233 /* Q */ }; -nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0); -nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0); -nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0); -nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0); -nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0); -void j_oconv(nkf_char c2, nkf_char c1); -void s_oconv(nkf_char c2, nkf_char c1); -void e_oconv(nkf_char c2, nkf_char c1); -void w_oconv(nkf_char c2, nkf_char c1); -void w_oconv16(nkf_char c2, nkf_char c1); -void w_oconv32(nkf_char c2, nkf_char c1); +static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0); +static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0); +static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0); +static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0); +static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0); +static void j_oconv(nkf_char c2, nkf_char c1); +static void s_oconv(nkf_char c2, nkf_char c1); +static void e_oconv(nkf_char c2, nkf_char c1); +static void w_oconv(nkf_char c2, nkf_char c1); +static void w_oconv16(nkf_char c2, nkf_char c1); +static void w_oconv32(nkf_char c2, nkf_char c1); typedef struct { const char *name; @@ -292,7 +150,7 @@ typedef struct { void (*oconv)(nkf_char c2, nkf_char c1); } nkf_native_encoding; -nkf_native_encoding NkfEncodingASCII = { "US_ASCII", e_iconv, e_oconv }; +nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv }; nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv }; nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv }; nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv }; @@ -305,8 +163,9 @@ typedef struct { const char *name; const nkf_native_encoding *base_encoding; } nkf_encoding; + nkf_encoding nkf_encoding_table[] = { - {ASCII, "ASCII", &NkfEncodingASCII}, + {ASCII, "US-ASCII", &NkfEncodingASCII}, {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII}, {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP}, {CP50220, "CP50220", &NkfEncodingISO_2022_JP}, @@ -314,10 +173,12 @@ nkf_encoding nkf_encoding_table[] = { {CP50222, "CP50222", &NkfEncodingISO_2022_JP}, {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP}, {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP}, + {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP}, {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS}, {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS}, {CP10001, "CP10001", &NkfEncodingShift_JIS}, {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP}, + {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP}, {CP51932, "CP51932", &NkfEncodingEUC_JP}, {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP}, {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP}, @@ -342,21 +203,28 @@ nkf_encoding nkf_encoding_table[] = { {BINARY, "BINARY", &NkfEncodingASCII}, {-1, NULL, NULL} }; -#define NKF_ENCODING_TABLE_SIZE 34 + struct { const char *name; const int id; } encoding_name_to_id_table[] = { + {"US-ASCII", ASCII}, {"ASCII", ASCII}, + {"646", ASCII}, + {"ROMAN8", ASCII}, {"ISO-2022-JP", ISO_2022_JP}, - {"X-ISO2022JP-CP932", CP50220}, + {"ISO2022JP-CP932", CP50220}, {"CP50220", CP50220}, {"CP50221", CP50221}, + {"CSISO2022JP", CP50221}, {"CP50222", CP50222}, {"ISO-2022-JP-1", ISO_2022_JP_1}, {"ISO-2022-JP-3", ISO_2022_JP_3}, + {"ISO-2022-JP-2004", ISO_2022_JP_2004}, {"SHIFT_JIS", SHIFT_JIS}, {"SJIS", SHIFT_JIS}, + {"MS_Kanji", SHIFT_JIS}, + {"PCK", SHIFT_JIS}, {"WINDOWS-31J", WINDOWS_31J}, {"CSWINDOWS31J", WINDOWS_31J}, {"CP932", WINDOWS_31J}, @@ -364,6 +232,7 @@ struct { {"CP10001", CP10001}, {"EUCJP", EUC_JP}, {"EUC-JP", EUC_JP}, + {"EUCJP-NKF", EUCJP_NKF}, {"CP51932", CP51932}, {"EUC-JP-MS", EUCJP_MS}, {"EUCJP-MS", EUCJP_MS}, @@ -392,19 +261,22 @@ struct { {"BINARY", BINARY}, {NULL, -1} }; + #if defined(DEFAULT_CODE_JIS) -#define DEFAULT_ENCODING ISO_2022_JP +#define DEFAULT_ENCIDX ISO_2022_JP #elif defined(DEFAULT_CODE_SJIS) -#define DEFAULT_ENCODING SHIFT_JIS +#define DEFAULT_ENCIDX SHIFT_JIS +#elif defined(DEFAULT_CODE_WINDOWS_31J) +#define DEFAULT_ENCIDX WINDOWS_31J #elif defined(DEFAULT_CODE_EUC) -#define DEFAULT_ENCODING EUC_JP +#define DEFAULT_ENCIDX EUC_JP #elif defined(DEFAULT_CODE_UTF8) -#define DEFAULT_ENCODING UTF_8 +#define DEFAULT_ENCIDX UTF_8 #endif #define is_alnum(c) \ - (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9')) + (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9')) /* I don't trust portablity of toupper */ #define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c) @@ -418,19 +290,16 @@ struct { #define nkf_isprint(c) (SP<=c && c<='~') #define nkf_isgraph(c) ('!'<=c && c<='~') #define hex2bin(c) (('0'<=c&&c<='9') ? (c-'0') : \ - ('A'<=c&&c<='F') ? (c-'A'+10) : \ - ('a'<=c&&c<='f') ? (c-'a'+10) : 0) + ('A'<=c&&c<='F') ? (c-'A'+10) : \ + ('a'<=c&&c<='f') ? (c-'a'+10) : 0) #define bin2hex(c) ("0123456789ABCDEF"[c&15]) #define is_eucg3(c2) (((unsigned short)c2 >> 8) == SS3) #define nkf_noescape_mime(c) ((c == CR) || (c == LF) || \ - ((c > SP) && (c < DEL) && (c != '?') && (c != '=') && (c != '_') \ - && (c != '(') && (c != ')') && (c != '.') && (c != 0x22))) + ((c > SP) && (c < DEL) && (c != '?') && (c != '=') && (c != '_') \ + && (c != '(') && (c != ')') && (c != '.') && (c != 0x22))) -#define CP932_TABLE_BEGIN 0xFA -#define CP932_TABLE_END 0xFC -#define CP932INV_TABLE_BEGIN 0xED -#define CP932INV_TABLE_END 0xEE #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END) +#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c <= 0x5F) #define HOLD_SIZE 1024 #if defined(INT_IS_SHORT) @@ -442,23 +311,11 @@ struct { #define DEFAULT_J 'B' #define DEFAULT_R 'B' -#define SJ0162 0x00e1 /* 01 - 62 ku offset */ -#define SJ6394 0x0161 /* 63 - 94 ku offset */ -#define RANGE_NUM_MAX 18 #define GETA1 0x22 #define GETA2 0x2e -#if defined(UTF8_OUTPUT_ENABLE) || defined(UTF8_INPUT_ENABLE) -#define sizeof_euc_to_utf8_1byte 94 -#define sizeof_euc_to_utf8_2bytes 94 -#define sizeof_utf8_to_euc_C2 64 -#define sizeof_utf8_to_euc_E5B8 64 -#define sizeof_utf8_to_euc_2bytes 112 -#define sizeof_utf8_to_euc_3bytes 16 -#endif - /* MIME preprocessor */ #ifdef EASYWIN /*Easy Win */ @@ -466,7 +323,7 @@ extern POINT _BufferSize; #endif struct input_code{ - char *name; + const char *name; nkf_char stat; nkf_char score; nkf_char index; @@ -476,19 +333,10 @@ struct input_code{ int _file_stat; }; -static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */ +static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */ static nkf_encoding *input_encoding = NULL; static nkf_encoding *output_encoding = NULL; -#if !defined(PERL_XS) && !defined(WIN32DLL) -static nkf_char noconvert(FILE *f); -#endif -static void module_connection(void); -static nkf_char kanji_convert(FILE *f); -static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1); -static nkf_char push_hold_buf(nkf_char c2); -static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0)); -static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1); #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) /* UCS Mapping * 0: Shift_JIS, eucJP-ascii @@ -509,39 +357,13 @@ static int no_cp932ext_f = FALSE; static int no_best_fit_chars_f = FALSE; static int input_endian = ENDIAN_BIG; static nkf_char unicode_subchar = '?'; /* the regular substitution character */ -static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c); -static void encode_fallback_html(nkf_char c); -static void encode_fallback_xml(nkf_char c); -static void encode_fallback_java(nkf_char c); -static void encode_fallback_perl(nkf_char c); -static void encode_fallback_subchar(nkf_char c); static void (*encode_fallback)(nkf_char c) = NULL; -static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1); -static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1); -static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1); -static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0); -static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0); -static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1); static void w_status(struct input_code *, nkf_char); #endif #ifdef UTF8_OUTPUT_ENABLE static int output_bom_f = FALSE; static int output_endian = ENDIAN_BIG; -static nkf_char e2w_conv(nkf_char c2,nkf_char c1); -#endif -static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1); -static void fold_conv(nkf_char c2,nkf_char c1); -static void nl_conv(nkf_char c2,nkf_char c1); -static void z_conv(nkf_char c2,nkf_char c1); -static void rot_conv(nkf_char c2,nkf_char c1); -static void hira_conv(nkf_char c2,nkf_char c1); -static void base64_conv(nkf_char c2,nkf_char c1); -static void iso2022jp_check_conv(nkf_char c2,nkf_char c1); -static void no_connection(nkf_char c2,nkf_char c1); -static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0); - -static void code_score(struct input_code *ptr); -static void code_status(nkf_char c); +#endif static void std_putc(nkf_char c); static nkf_char std_getc(FILE *f); @@ -550,31 +372,9 @@ static nkf_char std_ungetc(nkf_char c,FILE *f); static nkf_char broken_getc(FILE *f); static nkf_char broken_ungetc(nkf_char c,FILE *f); -static nkf_char mime_begin(FILE *f); static nkf_char mime_getc(FILE *f); -static nkf_char mime_ungetc(nkf_char c,FILE *f); - -static void switch_mime_getc(void); -static void unswitch_mime_getc(void); -static nkf_char mime_begin_strict(FILE *f); -static nkf_char mime_getc_buf(FILE *f); -static nkf_char mime_ungetc_buf(nkf_char c,FILE *f); -static nkf_char mime_integrity(FILE *f,const unsigned char *p); - -static nkf_char base64decode(nkf_char c); -static void mime_prechar(nkf_char c2, nkf_char c1); -static void mime_putc(nkf_char c); -static void open_mime(nkf_char c); -static void close_mime(void); -static void eof_mime(void); -static void mimeout_addchar(nkf_char c); -#ifndef PERL_XS -static void usage(void); -static void version(void); -static void show_configuration(void); -#endif -static void options(unsigned char *c); -static void reinit(void); + +static void mime_putc(nkf_char c); /* buffers */ @@ -582,19 +382,8 @@ static void reinit(void); static unsigned char stdibuf[IOBUF_SIZE]; static unsigned char stdobuf[IOBUF_SIZE]; #endif -static unsigned char hold_buf[HOLD_SIZE*2]; -static int hold_count = 0; - -/* MIME preprocessor fifo */ -#define MIME_BUF_SIZE (1024) /* 2^n ring buffer */ -#define MIME_BUF_MASK (MIME_BUF_SIZE-1) -#define Fifo(n) mime_buf[(n)&MIME_BUF_MASK] -static unsigned char mime_buf[MIME_BUF_SIZE]; -static unsigned int mime_top = 0; -static unsigned int mime_last = 0; /* decoded */ -static unsigned int mime_input = 0; /* undecoded */ -static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL; +#define NKF_UNSPECIFIED (-TRUE) /* flags */ static int unbuf_f = FALSE; @@ -610,50 +399,41 @@ static int mimebuf_f = FALSE; /* MIME buffered input */ static int broken_f = FALSE; /* convert ESC-less broken JIS */ static int iso8859_f = FALSE; /* ISO8859 through */ static int mimeout_f = FALSE; /* base64 mode */ -static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */ +static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */ static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */ #ifdef UNICODE_NORMALIZATION static int nfc_f = FALSE; static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */ static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc; -static nkf_char nfc_getc(FILE *f); -static nkf_char nfc_ungetc(nkf_char c,FILE *f); #endif #ifdef INPUT_OPTION static int cap_f = FALSE; static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */ static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc; -static nkf_char cap_getc(FILE *f); -static nkf_char cap_ungetc(nkf_char c,FILE *f); static int url_f = FALSE; static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */ static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc; -static nkf_char url_getc(FILE *f); -static nkf_char url_ungetc(nkf_char c,FILE *f); #endif -#if defined(INT_IS_SHORT) -#define NKF_INT32_C(n) (n##L) -#else -#define NKF_INT32_C(n) (n) -#endif -#define PREFIX_EUCG3 NKF_INT32_C(0x8F00) -#define CLASS_MASK NKF_INT32_C(0xFF000000) -#define CLASS_UNICODE NKF_INT32_C(0x01000000) -#define VALUE_MASK NKF_INT32_C(0x00FFFFFF) -#define UNICODE_MAX NKF_INT32_C(0x0010FFFF) -#define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE) -#define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF)) +#define PREFIX_EUCG3 NKF_INT32_C(0x8F00) +#define CLASS_MASK NKF_INT32_C(0xFF000000) +#define CLASS_UNICODE NKF_INT32_C(0x01000000) +#define VALUE_MASK NKF_INT32_C(0x00FFFFFF) +#define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF) +#define UNICODE_MAX NKF_INT32_C(0x0010FFFF) +#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3) +#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE) +#define nkf_char_unicode_p(c) ((c & CLASS_MASK) == CLASS_UNICODE) +#define nkf_char_unicode_bmp_p(c) ((c & VALUE_MASK) <= UNICODE_BMP_MAX) +#define nkf_char_unicode_value_p(c) ((c & VALUE_MASK) <= UNICODE_MAX) #ifdef NUMCHAR_OPTION static int numchar_f = FALSE; static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */ static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc; -static nkf_char numchar_getc(FILE *f); -static nkf_char numchar_ungetc(nkf_char c,FILE *f); #endif #ifdef CHECK_OPTION @@ -665,10 +445,7 @@ static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0; #endif static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */ -#if !defined PERL_XS -static void print_guessed_code(char *filename); -#endif -static void set_input_codename(char *codename); +static void set_input_codename(const char *codename); #ifdef EXEC_IO static int exec_f = 0; @@ -684,23 +461,11 @@ static int cp932inv_f = TRUE; /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */ #endif /* SHIFTJIS_CP932 */ -#ifdef X0212_ENABLE static int x0212_f = FALSE; -static nkf_char x0212_shift(nkf_char c); -static nkf_char x0212_unshift(nkf_char c); -#endif static int x0213_f = FALSE; static unsigned char prefix_table[256]; -static void set_code_score(struct input_code *ptr, nkf_char score); -static void clr_code_score(struct input_code *ptr, nkf_char score); -static void status_disable(struct input_code *ptr); -static void status_push_ch(struct input_code *ptr, nkf_char c); -static void status_clear(struct input_code *ptr); -static void status_reset(struct input_code *ptr); -static void status_reinit(struct input_code *ptr); -static void status_check(struct input_code *ptr, nkf_char c); static void e_status(struct input_code *, nkf_char); static void s_status(struct input_code *, nkf_char); @@ -709,10 +474,10 @@ struct input_code input_code_list[] = { {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0}, #ifdef UTF8_INPUT_ENABLE {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0}, - {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0}, - {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0}, + {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0}, + {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0}, #endif - {0} + {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0} }; static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */ @@ -738,28 +503,28 @@ static unsigned char ascii_intro = DEFAULT_R; static int fold_margin = FOLD_MARGIN; -/* converters */ +/* process default */ -#ifdef DEFAULT_CODE_JIS -# define DEFAULT_CONV j_oconv -#endif -#ifdef DEFAULT_CODE_SJIS -# define DEFAULT_CONV s_oconv -#endif -#ifdef DEFAULT_CODE_EUC -# define DEFAULT_CONV e_oconv -#endif -#ifdef DEFAULT_CODE_UTF8 -# define DEFAULT_CONV w_oconv -#endif +static nkf_char +no_connection2(nkf_char c2, nkf_char c1, nkf_char c0) +{ + fprintf(stderr,"nkf internal module connection failure.\n"); + exit(EXIT_FAILURE); + return 0; /* LINT */ +} + +static void +no_connection(nkf_char c2, nkf_char c1) +{ + no_connection2(c2,c1,0); +} -/* process default */ static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2; static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection; static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection; static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection; -static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection; +static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection; static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection; static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection; static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection; @@ -785,9 +550,8 @@ static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */ static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc; /* Global states */ -static int output_mode = ASCII, /* output kanji mode */ - input_mode = ASCII, /* input kanji mode */ - shift_mode = FALSE; /* TRUE shift out, or X0201 */ +static int output_mode = ASCII; /* output kanji mode */ +static int input_mode = ASCII; /* input kanji mode */ static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */ /* X0201 / X0208 conversion tables */ @@ -877,47 +641,67 @@ static const unsigned char fv[] = { +static int option_mode = 0; static int file_out_f = FALSE; #ifdef OVERWRITE static int overwrite_f = FALSE; static int preserve_time_f = FALSE; static int backup_f = FALSE; static char *backup_suffix = ""; -static char *get_backup_filename(const char *suffix, const char *filename); #endif -static int nlmode_f = 0; /* CR, LF, CRLF */ -static int input_newline = 0; /* 0: unestablished, EOF: MIXED */ +static int eolmode_f = 0; /* CR, LF, CRLF */ +static int input_eol = 0; /* 0: unestablished, EOF: MIXED */ static nkf_char prev_cr = 0; /* CR or 0 */ #ifdef EASYWIN /*Easy Win */ static int end_check; #endif /*Easy Win */ -#define STD_GC_BUFSIZE (256) -nkf_char std_gc_buf[STD_GC_BUFSIZE]; -nkf_char std_gc_ndx; +static void * +nkf_xmalloc(size_t size) +{ + void *ptr; + + if (size == 0) size = 1; + + ptr = malloc(size); + if (ptr == NULL) { + perror("can't malloc"); + exit(EXIT_FAILURE); + } + + return ptr; +} -char* nkf_strcpy(const char *str) +static void * +nkf_xrealloc(void *ptr, size_t size) { - char* result = malloc(strlen(str) + 1); - if (!result){ - perror(str); - return ""; + if (size == 0) size = 1; + + ptr = realloc(ptr, size); + if (ptr == NULL) { + perror("can't realloc"); + exit(EXIT_FAILURE); } - strcpy(result, str); - return result; + + return ptr; } -static void nkf_str_upcase(const char *src, char *dest, size_t length) +#define nkf_xfree(ptr) free(ptr) + +static int +nkf_str_caseeql(const char *src, const char *target) { - int i = 0; - for (; i < length && src[i]; i++) { - dest[i] = nkf_toupper(src[i]); + int i; + for (i = 0; src[i] && target[i]; i++) { + if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE; } - dest[i] = 0; + if (src[i] || target[i]) return FALSE; + else return TRUE; } -static nkf_encoding *nkf_enc_from_index(int idx) +static nkf_encoding* +nkf_enc_from_index(int idx) { if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) { return 0; @@ -925,18 +709,21 @@ static nkf_encoding *nkf_enc_from_index(int idx) return &nkf_encoding_table[idx]; } -static int nkf_enc_find_index(const char *name) +static int +nkf_enc_find_index(const char *name) { - int i, index = -1; + int i; + if (name[0] == 'X' && *(name+1) == '-') name += 2; for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) { - if (strcmp(name, encoding_name_to_id_table[i].name) == 0) { + if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) { return encoding_name_to_id_table[i].id; } } - return index; + return -1; } -static nkf_encoding *nkf_enc_find(const char *name) +static nkf_encoding* +nkf_enc_find(const char *name) { int idx = -1; idx = nkf_enc_find_index(name); @@ -950,296 +737,262 @@ static nkf_encoding *nkf_enc_find(const char *name) #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv #define nkf_enc_asciicompat(enc) (\ - nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\ - nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP) + nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\ + nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP) #define nkf_enc_unicode_p(enc) (\ - nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\ - nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\ - nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32) + nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\ + nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\ + nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32) #define nkf_enc_cp5022x_p(enc) (\ - nkf_enc_to_index(enc) == CP50220 ||\ - nkf_enc_to_index(enc) == CP50221 ||\ - nkf_enc_to_index(enc) == CP50222) + nkf_enc_to_index(enc) == CP50220 ||\ + nkf_enc_to_index(enc) == CP50221 ||\ + nkf_enc_to_index(enc) == CP50222) -#ifdef WIN32DLL -#include "nkf32dll.c" -#elif defined(PERL_XS) -#else /* WIN32DLL */ -int main(int argc, char **argv) +#ifdef DEFAULT_CODE_LOCALE +static const char* +nkf_locale_charmap() { - FILE *fin; - unsigned char *cp; +#ifdef HAVE_LANGINFO_H + return nl_langinfo(CODESET); +#elif defined(__WIN32__) + static char buf[16]; + sprintf(buf, "CP%d", GetACP()); + return buf; +#elif defined(__OS2__) +# if defined(INT_IS_SHORT) + /* OS/2 1.x */ + return NULL; +# else + /* OS/2 32bit */ + static char buf[16]; + ULONG ulCP[1], ulncp; + DosQueryCp(sizeof(ulCP), ulCP, &ulncp); + if (ulCP[0] == 932 || ulCP[0] == 943) + strcpy(buf, "Shift_JIS"); + else + sprintf(buf, "CP%lu", ulCP[0]); + return buf; +# endif +#endif + return NULL; +} - char *outfname = NULL; - char *origfname; +static nkf_encoding* +nkf_locale_encoding() +{ + nkf_encoding *enc = 0; + const char *encname = nkf_locale_charmap(); + if (encname) + enc = nkf_enc_find(encname); + return enc; +} +#endif /* DEFAULT_CODE_LOCALE */ -#ifdef EASYWIN /*Easy Win */ - _BufferSize.y = 400;/*Set Scroll Buffer Size*/ -#endif +static nkf_encoding* +nkf_utf8_encoding() +{ + return &nkf_encoding_table[UTF_8]; +} - for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) { - cp = (unsigned char *)*argv; - options(cp); - if (guess_f) { -#ifdef CHECK_OPTION - int debug_f_back = debug_f; -#endif -#ifdef EXEC_IO - int exec_f_back = exec_f; -#endif -#ifdef X0212_ENABLE - int x0212_f_back = x0212_f; -#endif - int x0213_f_back = x0213_f; - int guess_f_back = guess_f; - reinit(); - guess_f = guess_f_back; - mime_f = FALSE; -#ifdef CHECK_OPTION - debug_f = debug_f_back; -#endif -#ifdef EXEC_IO - exec_f = exec_f_back; -#endif -#ifdef X0212_ENABLE - x0212_f = x0212_f_back; -#endif - x0213_f = x0213_f_back; - } -#ifdef EXEC_IO - if (exec_f){ - int fds[2], pid; - if (pipe(fds) < 0 || (pid = fork()) < 0){ - abort(); - } - if (pid == 0){ - if (exec_f > 0){ - close(fds[0]); - dup2(fds[1], 1); - }else{ - close(fds[1]); - dup2(fds[0], 0); - } - execvp(argv[1], &argv[1]); - } - if (exec_f > 0){ - close(fds[1]); - dup2(fds[0], 0); - }else{ - close(fds[0]); - dup2(fds[1], 1); - } - argc = 0; - break; - } +static nkf_encoding* +nkf_default_encoding() +{ + nkf_encoding *enc = 0; +#ifdef DEFAULT_CODE_LOCALE + enc = nkf_locale_encoding(); +#elif defined(DEFAULT_ENCIDX) + enc = nkf_enc_from_index(DEFAULT_ENCIDX); +#endif + if (!enc) enc = nkf_utf8_encoding(); + return enc; +} + +typedef struct { + long capa; + long len; + nkf_char *ptr; +} nkf_buf_t; + +static nkf_buf_t * +nkf_buf_new(int length) +{ + nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t)); + buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length); + buf->capa = length; + buf->len = 0; + return buf; +} + +#if 0 +static void +nkf_buf_dispose(nkf_buf_t *buf) +{ + nkf_xfree(buf->ptr); + nkf_xfree(buf); +} #endif + +#define nkf_buf_length(buf) ((buf)->len) +#define nkf_buf_empty_p(buf) ((buf)->len == 0) + +static nkf_char +nkf_buf_at(nkf_buf_t *buf, int index) +{ + assert(index <= buf->len); + return buf->ptr[index]; +} + +static void +nkf_buf_clear(nkf_buf_t *buf) +{ + buf->len = 0; +} + +static void +nkf_buf_push(nkf_buf_t *buf, nkf_char c) +{ + if (buf->capa <= buf->len) { + exit(EXIT_FAILURE); } + buf->ptr[buf->len++] = c; +} - if (binmode_f == TRUE) -#if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) - if (freopen("","wb",stdout) == NULL) - return (-1); -#else - setbinmode(stdout); +static nkf_char +nkf_buf_pop(nkf_buf_t *buf) +{ + assert(!nkf_buf_empty_p(buf)); + return buf->ptr[--buf->len]; +} + +/* Normalization Form C */ +#ifndef PERL_XS +#ifdef WIN32DLL +#define fprintf dllprintf #endif - if (unbuf_f) - setbuf(stdout, (char *) NULL); - else - setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE); +static void +version(void) +{ + fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n"); +} - if (argc == 0) { - if (binmode_f == TRUE) -#if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) - if (freopen("","rb",stdin) == NULL) return (-1); +static void +usage(void) +{ + fprintf(HELP_OUTPUT, + "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n" +#ifdef UTF8_OUTPUT_ENABLE + " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" + " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n" #else - setbinmode(stdin); -#endif - setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE); - if (nop_f) - noconvert(stdin); - else { - kanji_convert(stdin); - if (guess_f) print_guessed_code(NULL); - } - } else { - int nfiles = argc; - int is_argument_error = FALSE; - while (argc--) { - input_codename = NULL; - input_newline = 0; -#ifdef CHECK_OPTION - iconv_for_check = 0; #endif - if ((fin = fopen((origfname = *argv++), "r")) == NULL) { - perror(*(argv-1)); - is_argument_error = TRUE; - continue; - } else { +#ifdef UTF8_INPUT_ENABLE + " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" + " UTF option is -W[8,[16,32][B,L]]\n" +#else + " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n" +#endif + ); + fprintf(HELP_OUTPUT, + " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n" + " M[BQ] MIME encode [B:base64 Q:quoted]\n" + " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n" + ); + fprintf(HELP_OUTPUT, + " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n" + " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n" + " 4: JISX0208 Katakana to JISX0201 Katakana\n" + " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n" + ); + fprintf(HELP_OUTPUT, + " O Output to File (DEFAULT 'nkf.out')\n" + " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n" + ); + fprintf(HELP_OUTPUT, + " --ic= Specify the input encoding\n" + " --oc= Specify the output encoding\n" + " --hiragana --katakana Hiragana/Katakana Conversion\n" + " --katakana-hiragana Converts each other\n" + ); + fprintf(HELP_OUTPUT, +#ifdef INPUT_OPTION + " --{cap, url}-input Convert hex after ':' or '%%'\n" +#endif +#ifdef NUMCHAR_OPTION + " --numchar-input Convert Unicode Character Reference\n" +#endif +#ifdef UTF8_INPUT_ENABLE + " --fb-{skip, html, xml, perl, java, subchar}\n" + " Specify unassigned character's replacement\n" +#endif + ); + fprintf(HELP_OUTPUT, #ifdef OVERWRITE - int fd = 0; - int fd_backup = 0; + " --in-place[=SUF] Overwrite original files\n" + " --overwrite[=SUF] Preserve timestamp of original files\n" #endif + " -g --guess Guess the input code\n" + " -v --version Print the version\n" + " --help/-V Print this help / configuration\n" + ); + version(); +} -/* reopen file for stdout */ - if (file_out_f == TRUE) { -#ifdef OVERWRITE - if (overwrite_f){ - outfname = malloc(strlen(origfname) - + strlen(".nkftmpXXXXXX") - + 1); - if (!outfname){ - perror(origfname); - return -1; - } - strcpy(outfname, origfname); -#ifdef MSDOS - { - int i; - for (i = strlen(outfname); i; --i){ - if (outfname[i - 1] == '/' - || outfname[i - 1] == '\\'){ - break; - } - } - outfname[i] = '\0'; - } - strcat(outfname, "ntXXXXXX"); - mktemp(outfname); - fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, - S_IREAD | S_IWRITE); -#else - strcat(outfname, ".nkftmpXXXXXX"); - fd = mkstemp(outfname); -#endif - if (fd < 0 - || (fd_backup = dup(fileno(stdout))) < 0 - || dup2(fd, fileno(stdout)) < 0 - ){ - perror(origfname); - return -1; - } - }else -#endif - if(argc == 1) { - outfname = *argv++; - argc--; - } else { - outfname = "nkf.out"; - } - - if(freopen(outfname, "w", stdout) == NULL) { - perror (outfname); - return (-1); - } - if (binmode_f == TRUE) { -#if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) - if (freopen("","wb",stdout) == NULL) - return (-1); +static void +show_configuration(void) +{ + fprintf(HELP_OUTPUT, + "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n" + " Compile-time options:\n" + " Compiled at: " __DATE__ " " __TIME__ "\n" + ); + fprintf(HELP_OUTPUT, + " Default output encoding: " +#ifdef DEFAULT_CODE_LOCALE + "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding()) +#elif defined(DEFAULT_ENCIDX) + "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding()) #else - setbinmode(stdout); + "NONE\n" #endif - } - } - if (binmode_f == TRUE) -#if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) - if (freopen("","rb",fin) == NULL) - return (-1); -#else - setbinmode(fin); -#endif - setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE); - if (nop_f) - noconvert(fin); - else { - char *filename = NULL; - kanji_convert(fin); - if (nfiles > 1) filename = origfname; - if (guess_f) print_guessed_code(filename); - } - fclose(fin); -#ifdef OVERWRITE - if (overwrite_f) { - struct stat sb; -#if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__) - time_t tb[2]; + ); + fprintf(HELP_OUTPUT, + " Default output end of line: " +#if DEFAULT_NEWLINE == CR + "CR" +#elif DEFAULT_NEWLINE == CRLF + "CRLF" #else - struct utimbuf tb; -#endif - - fflush(stdout); - close(fd); - if (dup2(fd_backup, fileno(stdout)) < 0){ - perror("dup2"); - } - if (stat(origfname, &sb)) { - fprintf(stderr, "Can't stat %s\n", origfname); - } - /* パーミッションを復元 */ - if (chmod(outfname, sb.st_mode)) { - fprintf(stderr, "Can't set permission %s\n", outfname); - } - - /* タイムスタンプを復元 */ - if(preserve_time_f){ -#if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__) - tb[0] = tb[1] = sb.st_mtime; - if (utime(outfname, tb)) { - fprintf(stderr, "Can't set timestamp %s\n", outfname); - } + "LF" +#endif + "\n" + " Decode MIME encoded string: " +#if MIME_DECODE_DEFAULT + "ON" #else - tb.actime = sb.st_atime; - tb.modtime = sb.st_mtime; - if (utime(outfname, &tb)) { - fprintf(stderr, "Can't set timestamp %s\n", outfname); - } + "OFF" #endif - } - if(backup_f){ - char *backup_filename = get_backup_filename(backup_suffix, origfname); -#ifdef MSDOS - unlink(backup_filename); + "\n" + " Convert JIS X 0201 Katakana: " +#if X0201_DEFAULT + "ON" +#else + "OFF" #endif - if (rename(origfname, backup_filename)) { - perror(backup_filename); - fprintf(stderr, "Can't rename %s to %s\n", - origfname, backup_filename); - } - }else{ -#ifdef MSDOS - if (unlink(origfname)){ - perror(origfname); - } + "\n" + " --help, --version output: " +#if HELP_OUTPUT_HELP_OUTPUT + "HELP_OUTPUT" +#else + "STDOUT" #endif - } - if (rename(outfname, origfname)) { - perror(origfname); - fprintf(stderr, "Can't rename %s to %s\n", - outfname, origfname); - } - free(outfname); - } -#endif - } - } - if (is_argument_error) - return(-1); - } -#ifdef EASYWIN /*Easy Win */ - if (file_out_f == FALSE) - scanf("%d",&end_check); - else - fclose(stdout); -#else /* for Other OS */ - if (file_out_f == TRUE) - fclose(stdout); -#endif /*Easy Win */ - return (0); + "\n"); } -#endif /* WIN32DLL */ +#endif /*PERL_XS*/ #ifdef OVERWRITE -char *get_backup_filename(const char *suffix, const char *filename) +static char* +get_backup_filename(const char *suffix, const char *filename) { char *backup_filename; int asterisk_count = 0; @@ -1251,12 +1004,7 @@ char *get_backup_filename(const char *suffix, const char *filename) } if(asterisk_count){ - backup_filename = malloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1); - if (!backup_filename){ - perror("Can't malloc backup filename."); - return NULL; - } - + backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1); for(i = 0, j = 0; suffix[i];){ if(suffix[i] == '*'){ backup_filename[j] = '\0'; @@ -1269,8 +1017,8 @@ char *get_backup_filename(const char *suffix, const char *filename) } backup_filename[j] = '\0'; }else{ - j = strlen(suffix) + filename_length; - backup_filename = malloc( + 1); + j = filename_length + strlen(suffix); + backup_filename = nkf_xmalloc(j + 1); strcpy(backup_filename, filename); strcat(backup_filename, suffix); backup_filename[j] = '\0'; @@ -1279,6 +1027,101 @@ char *get_backup_filename(const char *suffix, const char *filename) } #endif +#ifdef UTF8_INPUT_ENABLE +static void +nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c) +{ + int shift = 20; + c &= VALUE_MASK; + while(shift >= 0){ + if(c >= NKF_INT32_C(1)<= 0){ + (*f)(0, bin2hex(c>>shift)); + shift -= 4; + } + }else{ + shift -= 4; + } + } + return; +} + +static void +encode_fallback_html(nkf_char c) +{ + (*oconv)(0, '&'); + (*oconv)(0, '#'); + c &= VALUE_MASK; + if(c >= NKF_INT32_C(1000000)) + (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10); + if(c >= NKF_INT32_C(100000)) + (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10); + if(c >= 10000) + (*oconv)(0, 0x30+(c/10000 )%10); + if(c >= 1000) + (*oconv)(0, 0x30+(c/1000 )%10); + if(c >= 100) + (*oconv)(0, 0x30+(c/100 )%10); + if(c >= 10) + (*oconv)(0, 0x30+(c/10 )%10); + if(c >= 0) + (*oconv)(0, 0x30+ c %10); + (*oconv)(0, ';'); + return; +} + +static void +encode_fallback_xml(nkf_char c) +{ + (*oconv)(0, '&'); + (*oconv)(0, '#'); + (*oconv)(0, 'x'); + nkf_each_char_to_hex(oconv, c); + (*oconv)(0, ';'); + return; +} + +static void +encode_fallback_java(nkf_char c) +{ + (*oconv)(0, '\\'); + c &= VALUE_MASK; + if(!nkf_char_unicode_bmp_p(c)){ + (*oconv)(0, 'U'); + (*oconv)(0, '0'); + (*oconv)(0, '0'); + (*oconv)(0, bin2hex(c>>20)); + (*oconv)(0, bin2hex(c>>16)); + }else{ + (*oconv)(0, 'u'); + } + (*oconv)(0, bin2hex(c>>12)); + (*oconv)(0, bin2hex(c>> 8)); + (*oconv)(0, bin2hex(c>> 4)); + (*oconv)(0, bin2hex(c )); + return; +} + +static void +encode_fallback_perl(nkf_char c) +{ + (*oconv)(0, '\\'); + (*oconv)(0, 'x'); + (*oconv)(0, '{'); + nkf_each_char_to_hex(oconv, c); + (*oconv)(0, '}'); + return; +} + +static void +encode_fallback_subchar(nkf_char c) +{ + c = unicode_subchar; + (*oconv)((c>>8)&0xFF, c&0xFF); + return; +} +#endif + static const struct { const char *name; const char *alias; @@ -1289,7 +1132,7 @@ static const struct { {"euc","e"}, {"euc-input","E"}, {"fj","jm"}, - {"help","v"}, + {"help",""}, {"jis","j"}, {"jis-input","J"}, {"mac","sLm"}, @@ -1299,7 +1142,7 @@ static const struct { {"sjis","s"}, {"sjis-input","S"}, {"unix","eLu"}, - {"version","V"}, + {"version","v"}, {"windows","sLw"}, {"hiragana","h1"}, {"katakana","h2"}, @@ -1359,2085 +1202,835 @@ static const struct { {"prefix=", ""}, }; -static int option_mode = 0; - -void options(unsigned char *cp) +static void +set_input_encoding(nkf_encoding *enc) { - nkf_char i, j; - unsigned char *p; - unsigned char *cp_back = NULL; - char codeset[32]; - nkf_encoding *enc; - - if (option_mode==1) - return; - while(*cp && *cp++!='-'); - while (*cp || cp_back) { - if(!*cp){ - cp = cp_back; - cp_back = NULL; - continue; - } - p = 0; - switch (*cp++) { - case '-': /* literal options */ - if (!*cp || *cp == SP) { /* ignore the rest of arguments */ - option_mode = 1; - return; - } - for (i=0;iname){ - if (iconv_func == p->iconv_func){ - return p; - } - p++; - } + struct input_code *p = input_code_list; + while (p->name){ + if (iconv_func == p->iconv_func){ + return p; + } + p++; + } } return 0; } -void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0)) +static void +set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0)) { #ifdef INPUT_CODE_FIX if (f || !input_encoding) #endif - if (estab_f != f){ - estab_f = f; - } + if (estab_f != f){ + estab_f = f; + } if (iconv_func #ifdef INPUT_CODE_FIX - && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */ + && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */ #endif - ){ - iconv = iconv_func; + ){ + iconv = iconv_func; } #ifdef CHECK_OPTION if (estab_f && iconv_for_check != iconv){ - struct input_code *p = find_inputcode_byfunc(iconv); - if (p){ - set_input_codename(p->name); - debug(p->name); - } - iconv_for_check = iconv; + struct input_code *p = find_inputcode_byfunc(iconv); + if (p){ + set_input_codename(p->name); + debug(p->name); + } + iconv_for_check = iconv; } #endif } -#define SCORE_L2 (1) /* 第2水準漢字 */ -#define SCORE_KANA (SCORE_L2 << 1) /* いわゆる半角カナ */ -#define SCORE_DEPEND (SCORE_KANA << 1) /* 機種依存文字 */ -#define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932 による読み換え (IBM extended characters) */ -#define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */ -#define SCORE_NO_EXIST (SCORE_X0212 << 1) /* 存在しない文字 */ -#define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME による指定 */ -#define SCORE_ERROR (SCORE_iMIME << 1) /* エラー */ - -#define SCORE_INIT (SCORE_iMIME) - -static const char score_table_A0[] = { - 0, 0, 0, 0, - 0, 0, 0, 0, - 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, - SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST, -}; +#ifdef X0212_ENABLE +static nkf_char +x0212_shift(nkf_char c) +{ + nkf_char ret = c; + c &= 0x7f; + if (is_eucg3(ret)){ + if (0x75 <= c && c <= 0x7f){ + ret = c + (0x109 - 0x75); + } + }else{ + if (0x75 <= c && c <= 0x7f){ + ret = c + (0x113 - 0x75); + } + } + return ret; +} -static const char score_table_F0[] = { - SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2, - SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, - SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932, - SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR, -}; -void set_code_score(struct input_code *ptr, nkf_char score) +static nkf_char +x0212_unshift(nkf_char c) { - if (ptr){ - ptr->score |= score; + nkf_char ret = c; + if (0x7f <= c && c <= 0x88){ + ret = c + (0x75 - 0x7f); + }else if (0x89 <= c && c <= 0x92){ + ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89)); } + return ret; } +#endif /* X0212_ENABLE */ -void clr_code_score(struct input_code *ptr, nkf_char score) +static nkf_char +e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) { - if (ptr){ - ptr->score &= ~score; + nkf_char ndx; + if (is_eucg3(c2)){ + ndx = c2 & 0x7f; + if (x0213_f){ + if((0x21 <= ndx && ndx <= 0x2F)){ + if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3; + if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); + return 0; + }else if(0x6E <= ndx && ndx <= 0x7E){ + if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe; + if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); + return 0; + } + return 1; + } +#ifdef X0212_ENABLE + else if(nkf_isgraph(ndx)){ + nkf_char val = 0; + const unsigned short *ptr; + ptr = x0212_shiftjis[ndx - 0x21]; + if (ptr){ + val = ptr[(c1 & 0x7f) - 0x21]; + } + if (val){ + c2 = val >> 8; + c1 = val & 0xff; + if (p2) *p2 = c2; + if (p1) *p1 = c1; + return 0; + } + c2 = x0212_shift(c2); + } +#endif /* X0212_ENABLE */ } + if(0x7F < c2) return 1; + if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1); + if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); + return 0; } -void code_score(struct input_code *ptr) +static nkf_char +s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) { - nkf_char c2 = ptr->buf[0]; -#ifdef UTF8_OUTPUT_ENABLE - nkf_char c1 = ptr->buf[1]; +#if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE) + nkf_char val; #endif - if (c2 < 0){ - set_code_score(ptr, SCORE_ERROR); - }else if (c2 == SSO){ - set_code_score(ptr, SCORE_KANA); - }else if (c2 == 0x8f){ - set_code_score(ptr, SCORE_X0212); -#ifdef UTF8_OUTPUT_ENABLE - }else if (!e2w_conv(c2, c1)){ - set_code_score(ptr, SCORE_NO_EXIST); -#endif - }else if ((c2 & 0x70) == 0x20){ - set_code_score(ptr, score_table_A0[c2 & 0x0f]); - }else if ((c2 & 0x70) == 0x70){ - set_code_score(ptr, score_table_F0[c2 & 0x0f]); - }else if ((c2 & 0x70) >= 0x50){ - set_code_score(ptr, SCORE_L2); + static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} }; + if (0xFC < c1) return 1; +#ifdef SHIFTJIS_CP932 + if (!cp932inv_f && is_ibmext_in_sjis(c2)){ + val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40]; + if (val){ + c2 = val >> 8; + c1 = val & 0xff; + } } -} - -void status_disable(struct input_code *ptr) -{ - ptr->stat = -1; - ptr->buf[0] = -1; - code_score(ptr); - if (iconv == ptr->iconv_func) set_iconv(FALSE, 0); -} - -void status_push_ch(struct input_code *ptr, nkf_char c) -{ - ptr->buf[ptr->index++] = c; -} - -void status_clear(struct input_code *ptr) -{ - ptr->stat = 0; - ptr->index = 0; -} - -void status_reset(struct input_code *ptr) -{ - status_clear(ptr); - ptr->score = SCORE_INIT; -} - -void status_reinit(struct input_code *ptr) -{ - status_reset(ptr); - ptr->_file_stat = 0; -} - -void status_check(struct input_code *ptr, nkf_char c) -{ - if (c <= DEL && estab_f){ - status_reset(ptr); + if (cp932inv_f + && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){ + val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40]; + if (val){ + c2 = val >> 8; + c1 = val & 0xff; + } } -} - -void s_status(struct input_code *ptr, nkf_char c) -{ - switch(ptr->stat){ - case -1: - status_check(ptr, c); - break; - case 0: - if (c <= DEL){ - break; -#ifdef NUMCHAR_OPTION - }else if (is_unicode_capsule(c)){ - break; -#endif - }else if (0xa1 <= c && c <= 0xdf){ - status_push_ch(ptr, SSO); - status_push_ch(ptr, c); - code_score(ptr); - status_clear(ptr); - }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){ - ptr->stat = 1; - status_push_ch(ptr, c); - }else if (0xed <= c && c <= 0xee){ - ptr->stat = 3; - status_push_ch(ptr, c); -#ifdef SHIFTJIS_CP932 - }else if (is_ibmext_in_sjis(c)){ - ptr->stat = 2; - status_push_ch(ptr, c); #endif /* SHIFTJIS_CP932 */ #ifdef X0212_ENABLE - }else if (0xf0 <= c && c <= 0xfc){ - ptr->stat = 1; - status_push_ch(ptr, c); -#endif /* X0212_ENABLE */ - }else{ - status_disable(ptr); - } - break; - case 1: - if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){ - status_push_ch(ptr, c); - s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]); - code_score(ptr); - status_clear(ptr); - }else{ - status_disable(ptr); - } - break; - case 2: -#ifdef SHIFTJIS_CP932 - if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) { - status_push_ch(ptr, c); - if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) { - set_code_score(ptr, SCORE_CP932); - status_clear(ptr); - break; + if (!x0213_f && is_ibmext_in_sjis(c2)){ + val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40]; + if (val){ + if (val > 0x7FFF){ + c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f); + c1 = val & 0xff; + }else{ + c2 = val >> 8; + c1 = val & 0xff; } + if (p2) *p2 = c2; + if (p1) *p1 = c1; + return 0; } -#endif /* SHIFTJIS_CP932 */ - status_disable(ptr); - break; - case 3: - if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){ - status_push_ch(ptr, c); - s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]); - set_code_score(ptr, SCORE_CP932); - status_clear(ptr); - }else{ - status_disable(ptr); - } - break; } -} - -void e_status(struct input_code *ptr, nkf_char c) -{ - switch (ptr->stat){ - case -1: - status_check(ptr, c); - break; - case 0: - if (c <= DEL){ - break; -#ifdef NUMCHAR_OPTION - }else if (is_unicode_capsule(c)){ - break; #endif - }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){ - ptr->stat = 1; - status_push_ch(ptr, c); -#ifdef X0212_ENABLE - }else if (0x8f == c){ - ptr->stat = 2; - status_push_ch(ptr, c); -#endif /* X0212_ENABLE */ - }else{ - status_disable(ptr); - } - break; - case 1: - if (0xa1 <= c && c <= 0xfe){ - status_push_ch(ptr, c); - code_score(ptr); - status_clear(ptr); - }else{ - status_disable(ptr); - } - break; -#ifdef X0212_ENABLE - case 2: - if (0xa1 <= c && c <= 0xfe){ - ptr->stat = 1; - status_push_ch(ptr, c); - }else{ - status_disable(ptr); - } -#endif /* X0212_ENABLE */ - } -} - -#ifdef UTF8_INPUT_ENABLE -void w_status(struct input_code *ptr, nkf_char c) -{ - switch (ptr->stat){ - case -1: - status_check(ptr, c); - break; - case 0: - if (c <= DEL){ - break; -#ifdef NUMCHAR_OPTION - }else if (is_unicode_capsule(c)){ - break; -#endif - }else if (0xc0 <= c && c <= 0xdf){ - ptr->stat = 1; - status_push_ch(ptr, c); - }else if (0xe0 <= c && c <= 0xef){ - ptr->stat = 2; - status_push_ch(ptr, c); - }else if (0xf0 <= c && c <= 0xf4){ - ptr->stat = 3; - status_push_ch(ptr, c); - }else{ - status_disable(ptr); - } - break; - case 1: - case 2: - if (0x80 <= c && c <= 0xbf){ - status_push_ch(ptr, c); - if (ptr->index > ptr->stat){ - int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb - && ptr->buf[2] == 0xbf); - w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2], - &ptr->buf[0], &ptr->buf[1]); - if (!bom){ - code_score(ptr); - } - status_clear(ptr); - } - }else{ - status_disable(ptr); - } - break; - case 3: - if (0x80 <= c && c <= 0xbf){ - if (ptr->index < ptr->stat){ - status_push_ch(ptr, c); - } else { - status_clear(ptr); + if(c2 >= 0x80){ + if(x0213_f && c2 >= 0xF0){ + if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */ + c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1]; + }else{ /* 78<=k<=94 */ + c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B); + if (0x9E < c1) c2++; } - }else{ - status_disable(ptr); - } - break; - } -} -#endif - -void code_status(nkf_char c) -{ - int action_flag = 1; - struct input_code *result = 0; - struct input_code *p = input_code_list; - while (p->name){ - if (!p->status_func) { - ++p; - continue; + }else{ +#define SJ0162 0x00e1 /* 01 - 62 ku offset */ +#define SJ6394 0x0161 /* 63 - 94 ku offset */ + c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394); + if (0x9E < c1) c2++; + } + if (c1 < 0x9F) + c1 = c1 - ((c1 > DEL) ? SP : 0x1F); + else { + c1 = c1 - 0x7E; } - if (!p->status_func) - continue; - (p->status_func)(p, c); - if (p->stat > 0){ - action_flag = 0; - }else if(p->stat == 0){ - if (result){ - action_flag = 0; - }else{ - result = p; - } - } - ++p; - } - - if (action_flag){ - if (result && !estab_f){ - set_iconv(TRUE, result->iconv_func); - }else if (c <= DEL){ - struct input_code *ptr = input_code_list; - while (ptr->name){ - status_reset(ptr); - ++ptr; - } - } } -} -#ifndef WIN32DLL -nkf_char std_getc(FILE *f) -{ - if (std_gc_ndx){ - return std_gc_buf[--std_gc_ndx]; - } - return getc(f); +#ifdef X0212_ENABLE + c2 = x0212_unshift(c2); +#endif + if (p2) *p2 = c2; + if (p1) *p1 = c1; + return 0; } -#endif /*WIN32DLL*/ -nkf_char std_ungetc(nkf_char c, FILE *f) +#if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) +static void +nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4) { - if (std_gc_ndx == STD_GC_BUFSIZE){ - return EOF; + val &= VALUE_MASK; + if (val < 0x80){ + *p1 = val; + *p2 = 0; + *p3 = 0; + *p4 = 0; + }else if (val < 0x800){ + *p1 = 0xc0 | (val >> 6); + *p2 = 0x80 | (val & 0x3f); + *p3 = 0; + *p4 = 0; + } else if (nkf_char_unicode_bmp_p(val)) { + *p1 = 0xe0 | (val >> 12); + *p2 = 0x80 | ((val >> 6) & 0x3f); + *p3 = 0x80 | ( val & 0x3f); + *p4 = 0; + } else if (nkf_char_unicode_value_p(val)) { + *p1 = 0xf0 | (val >> 18); + *p2 = 0x80 | ((val >> 12) & 0x3f); + *p3 = 0x80 | ((val >> 6) & 0x3f); + *p4 = 0x80 | ( val & 0x3f); + } else { + *p1 = 0; + *p2 = 0; + *p3 = 0; + *p4 = 0; } - std_gc_buf[std_gc_ndx++] = c; - return c; -} - -#ifndef WIN32DLL -void std_putc(nkf_char c) -{ - if(c!=EOF) - putchar(c); } -#endif /*WIN32DLL*/ -#if !defined(PERL_XS) && !defined(WIN32DLL) -nkf_char noconvert(FILE *f) +static nkf_char +nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) { - nkf_char c; - - if (nop_f == 2) - module_connection(); - while ((c = (*i_getc)(f)) != EOF) - (*o_putc)(c); - (*o_putc)(EOF); - return 1; + nkf_char wc; + if (c1 <= 0x7F) { + /* single byte */ + wc = c1; + } + else if (c1 <= 0xC3) { + /* trail byte or invalid */ + return -1; + } + else if (c1 <= 0xDF) { + /* 2 bytes */ + wc = (c1 & 0x1F) << 6; + wc |= (c2 & 0x3F); + } + else if (c1 <= 0xEF) { + /* 3 bytes */ + wc = (c1 & 0x0F) << 12; + wc |= (c2 & 0x3F) << 6; + wc |= (c3 & 0x3F); + } + else if (c2 <= 0xF4) { + /* 4 bytes */ + wc = (c1 & 0x0F) << 18; + wc |= (c2 & 0x3F) << 12; + wc |= (c3 & 0x3F) << 6; + wc |= (c4 & 0x3F); + } + else { + return -1; + } + return wc; } #endif -void module_connection(void) +#ifdef UTF8_INPUT_ENABLE +static int +unicode_to_jis_common2(nkf_char c1, nkf_char c0, + const unsigned short *const *pp, nkf_char psize, + nkf_char *p2, nkf_char *p1) { - if (!output_encoding) output_encoding = nkf_enc_from_index(DEFAULT_ENCODING); - oconv = nkf_enc_to_oconv(output_encoding); - o_putc = std_putc; - - /* replace continucation module, from output side */ + nkf_char c2; + const unsigned short *p; + unsigned short val; - /* output redicrection */ -#ifdef CHECK_OPTION - if (noout_f || guess_f){ - o_putc = no_putc; - } -#endif - if (mimeout_f) { - o_mputc = o_putc; - o_putc = mime_putc; - if (mimeout_f == TRUE) { - o_base64conv = oconv; oconv = base64_conv; - } - /* base64_count = 0; */ - } + if (pp == 0) return 1; - if (nlmode_f || guess_f) { - o_nlconv = oconv; oconv = nl_conv; - } - if (rot_f) { - o_rot_conv = oconv; oconv = rot_conv; - } - if (iso2022jp_f) { - o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv; - } - if (hira_f) { - o_hira_conv = oconv; oconv = hira_conv; - } - if (fold_f) { - o_fconv = oconv; oconv = fold_conv; - f_line = 0; - } - if (alpha_f || x0201_f) { - o_zconv = oconv; oconv = z_conv; - } + c1 -= 0x80; + if (c1 < 0 || psize <= c1) return 1; + p = pp[c1]; + if (p == 0) return 1; - i_getc = std_getc; - i_ungetc = std_ungetc; - /* input redicrection */ -#ifdef INPUT_OPTION - if (cap_f){ - i_cgetc = i_getc; i_getc = cap_getc; - i_cungetc = i_ungetc; i_ungetc= cap_ungetc; - } - if (url_f){ - i_ugetc = i_getc; i_getc = url_getc; - i_uungetc = i_ungetc; i_ungetc= url_ungetc; - } -#endif -#ifdef NUMCHAR_OPTION - if (numchar_f){ - i_ngetc = i_getc; i_getc = numchar_getc; - i_nungetc = i_ungetc; i_ungetc= numchar_ungetc; - } -#endif -#ifdef UNICODE_NORMALIZATION - if (nfc_f){ - i_nfc_getc = i_getc; i_getc = nfc_getc; - i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc; - } -#endif - if (mime_f && mimebuf_f==FIXED_MIME) { - i_mgetc = i_getc; i_getc = mime_getc; - i_mungetc = i_ungetc; i_ungetc = mime_ungetc; - } - if (broken_f & 1) { - i_bgetc = i_getc; i_getc = broken_getc; - i_bungetc = i_ungetc; i_ungetc = broken_ungetc; - } - if (input_encoding) { - set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding)); - } else { - set_iconv(FALSE, e_iconv); - } + c0 -= 0x80; + if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1; + val = p[c0]; + if (val == 0) return 1; + if (no_cp932ext_f && ( + (val>>8) == 0x2D || /* NEC special characters */ + val > NKF_INT32_C(0xF300) /* IBM extended characters */ + )) return 1; - { - struct input_code *p = input_code_list; - while (p->name){ - status_reinit(p++); - } + c2 = val >> 8; + if (val > 0x7FFF){ + c2 &= 0x7f; + c2 |= PREFIX_EUCG3; } + if (c2 == SO) c2 = JIS_X_0201_1976_K; + c1 = val & 0xFF; + if (p2) *p2 = c2; + if (p1) *p1 = c1; + return 0; } -/* - * Check and Ignore BOM - */ -void check_bom(FILE *f) +static int +unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1) { - int c2; - switch(c2 = (*i_getc)(f)){ - case 0x00: - if((c2 = (*i_getc)(f)) == 0x00){ - if((c2 = (*i_getc)(f)) == 0xFE){ - if((c2 = (*i_getc)(f)) == 0xFF){ - if(!input_encoding){ - set_iconv(TRUE, w_iconv32); - } - if (iconv == w_iconv32) { - input_endian = ENDIAN_BIG; - return; - } - (*i_ungetc)(0xFF,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0xFE,f); - }else if(c2 == 0xFF){ - if((c2 = (*i_getc)(f)) == 0xFE){ - if(!input_encoding){ - set_iconv(TRUE, w_iconv32); - } - if (iconv == w_iconv32) { - input_endian = ENDIAN_2143; - return; - } - (*i_ungetc)(0xFF,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0xFF,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0x00,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0x00,f); - break; - case 0xEF: - if((c2 = (*i_getc)(f)) == 0xBB){ - if((c2 = (*i_getc)(f)) == 0xBF){ - if(!input_encoding){ - set_iconv(TRUE, w_iconv); + const unsigned short *const *pp; + const unsigned short *const *const *ppp; + static const char no_best_fit_chars_table_C2[] = + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2, + 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1}; + static const char no_best_fit_chars_table_C2_ms[] = + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, + 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0}; + static const char no_best_fit_chars_table_932_C2[] = + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0}; + static const char no_best_fit_chars_table_932_C3[] = + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1}; + nkf_char ret = 0; + + if(c2 < 0x80){ + *p2 = 0; + *p1 = c2; + }else if(c2 < 0xe0){ + if(no_best_fit_chars_f){ + if(ms_ucs_map_f == UCS_MAP_CP932){ + switch(c2){ + case 0xC2: + if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1; + break; + case 0xC3: + if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1; + break; } - if (iconv == w_iconv) { - return; + }else if(!cp932inv_f){ + switch(c2){ + case 0xC2: + if(no_best_fit_chars_table_C2[c1&0x3F]) return 1; + break; + case 0xC3: + if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1; + break; } - (*i_ungetc)(0xBF,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0xBB,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0xEF,f); - break; - case 0xFE: - if((c2 = (*i_getc)(f)) == 0xFF){ - if((c2 = (*i_getc)(f)) == 0x00){ - if((c2 = (*i_getc)(f)) == 0x00){ - if(!input_encoding){ - set_iconv(TRUE, w_iconv32); - } - if (iconv == w_iconv32) { - input_endian = ENDIAN_3412; - return; - } - (*i_ungetc)(0x00,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0x00,f); - }else (*i_ungetc)(c2,f); - if(!input_encoding){ - set_iconv(TRUE, w_iconv16); - } - if (iconv == w_iconv16) { - input_endian = ENDIAN_BIG; - return; - } - (*i_ungetc)(0xFF,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0xFE,f); - break; - case 0xFF: - if((c2 = (*i_getc)(f)) == 0xFE){ - if((c2 = (*i_getc)(f)) == 0x00){ - if((c2 = (*i_getc)(f)) == 0x00){ - if(!input_encoding){ - set_iconv(TRUE, w_iconv32); - } - if (iconv == w_iconv32) { - input_endian = ENDIAN_LITTLE; - return; + }else if(ms_ucs_map_f == UCS_MAP_MS){ + if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1; + }else if(ms_ucs_map_f == UCS_MAP_CP10001){ + switch(c2){ + case 0xC2: + switch(c1){ + case 0xA2: + case 0xA3: + case 0xA5: + case 0xA6: + case 0xAC: + case 0xAF: + case 0xB8: + return 1; } - (*i_ungetc)(0x00,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0x00,f); - }else (*i_ungetc)(c2,f); - if(!input_encoding){ - set_iconv(TRUE, w_iconv16); - } - if (iconv == w_iconv16) { - input_endian = ENDIAN_LITTLE; - return; + break; + } } - (*i_ungetc)(0xFE,f); - }else (*i_ungetc)(c2,f); - (*i_ungetc)(0xFF,f); - break; - default: - (*i_ungetc)(c2,f); - break; - } -} - -/* - Conversion main loop. Code detection only. - */ - -nkf_char kanji_convert(FILE *f) -{ - nkf_char c3, c2=0, c1, c0=0; - int is_8bit = FALSE; - - if (input_encoding && !nkf_enc_asciicompat(input_encoding)) { - is_8bit = TRUE; - } - - input_mode = ASCII; - output_mode = ASCII; - shift_mode = FALSE; - -#define NEXT continue /* no output, get next */ -#define SEND ; /* output c1 and c2, get next */ -#define LAST break /* end of loop, go closing */ - - module_connection(); - check_bom(f); - - while ((c1 = (*i_getc)(f)) != EOF) { -#ifdef INPUT_CODE_FIX - if (!input_encoding) -#endif - code_status(c1); - if (c2) { - /* second byte */ - if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) { - /* in case of 8th bit is on */ - if (!estab_f&&!mime_decode_mode) { - /* in case of not established yet */ - /* It is still ambiguious */ - if (h_conv(f, c2, c1)==EOF) - LAST; - else - c2 = 0; - NEXT; - } else { - /* in case of already established */ - if (c1 < AT) { - /* ignore bogus code and not CP5022x UCD */ - c2 = 0; - NEXT; - } else { - SEND; + } + pp = + ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 : + ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms : + ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac : + utf8_to_euc_2bytes; + ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1); + }else if(c0 < 0xF0){ + if(no_best_fit_chars_f){ + if(ms_ucs_map_f == UCS_MAP_CP932){ + if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1; + }else if(ms_ucs_map_f == UCS_MAP_MS){ + switch(c2){ + case 0xE2: + switch(c1){ + case 0x80: + if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1; + break; + case 0x88: + if(c0 == 0x92) return 1; + break; } + break; + case 0xE3: + if(c1 == 0x80 || c0 == 0x9C) return 1; + break; } - } else - /* second byte, 7 bit code */ - /* it might be kanji shitfted */ - if ((c1 == DEL) || (c1 <= SP)) { - /* ignore bogus first code */ - c2 = 0; - NEXT; - } else - SEND; - } else { - /* first byte */ -#ifdef UTF8_INPUT_ENABLE - if (iconv == w_iconv16) { - if (input_endian == ENDIAN_BIG) { - c2 = c1; - if ((c1 = (*i_getc)(f)) != EOF) { - if (0xD8 <= c2 && c2 <= 0xDB) { - if ((c0 = (*i_getc)(f)) != EOF) { - c0 <<= 8; - if ((c3 = (*i_getc)(f)) != EOF) { - c0 |= c3; - } else c2 = EOF; - } else c2 = EOF; - } - } else c2 = EOF; - } else { - if ((c2 = (*i_getc)(f)) != EOF) { - if (0xD8 <= c2 && c2 <= 0xDB) { - if ((c3 = (*i_getc)(f)) != EOF) { - if ((c0 = (*i_getc)(f)) != EOF) { - c0 <<= 8; - c0 |= c3; - } else c2 = EOF; - } else c2 = EOF; - } - } else c2 = EOF; + }else if(ms_ucs_map_f == UCS_MAP_CP10001){ + switch(c2){ + case 0xE3: + switch(c1){ + case 0x82: + if(c0 == 0x94) return 1; + break; + case 0x83: + if(c0 == 0xBB) return 1; + break; + } + break; } - SEND; - } else if(iconv == w_iconv32){ - int c3 = c1; - if((c2 = (*i_getc)(f)) != EOF && - (c1 = (*i_getc)(f)) != EOF && - (c0 = (*i_getc)(f)) != EOF){ - switch(input_endian){ - case ENDIAN_BIG: - c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF); + }else{ + switch(c2){ + case 0xE2: + switch(c1){ + case 0x80: + if(c0 == 0x95) return 1; + break; + case 0x88: + if(c0 == 0xA5) return 1; break; - case ENDIAN_LITTLE: - c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16; + } + break; + case 0xEF: + switch(c1){ + case 0xBC: + if(c0 == 0x8D) return 1; break; - case ENDIAN_2143: - c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8; + case 0xBD: + if(c0 == 0x9E && !cp932inv_f) return 1; break; - case ENDIAN_3412: - c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16; + case 0xBF: + if(0xA0 <= c0 && c0 <= 0xA5) return 1; break; } - c2 = 0; - }else{ - c2 = EOF; + break; } - SEND; - } else -#endif -#ifdef NUMCHAR_OPTION - if (is_unicode_capsule(c1)){ - SEND; - } else + } + } + ppp = + ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 : + ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms : + ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac : + utf8_to_euc_3bytes; + ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1); + }else return -1; +#ifdef SHIFTJIS_CP932 + if (!ret && !cp932inv_f && is_eucg3(*p2)) { + nkf_char s2, s1; + if (e2s_conv(*p2, *p1, &s2, &s1) == 0) { + s2e_conv(s2, s1, p2, p1); + }else{ + ret = 1; + } + } #endif - if (c1 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) { - /* 8 bit code */ - if (!estab_f && !iso8859_f) { - /* not established yet */ - c2 = c1; - NEXT; - } else { /* estab_f==TRUE */ - if (iso8859_f) { - c2 = ISO_8859_1; - c1 &= 0x7f; - SEND; - } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) { - /* SJIS X0201 Case... */ - if (iso2022jp_f && !x0201_f) { - (*oconv)(GETA1, GETA2); - NEXT; - } else { - c2 = JIS_X_0201; - c1 &= 0x7f; - SEND; - } - } else if (c1==SSO && iconv != s_iconv) { - /* EUC X0201 Case */ - c1 = (*i_getc)(f); /* skip SSO */ - code_status(c1); - if (SSP<=c1 && c1<0xe0) { - if (iso2022jp_f && !x0201_f) { - (*oconv)(GETA1, GETA2); - NEXT; - } else { - c2 = JIS_X_0201; - c1 &= 0x7f; - SEND; - } - } else { /* bogus code, skip SSO and one byte */ - NEXT; - } - } else if (ms_ucs_map_f == UCS_MAP_CP10001 && - (c1 == 0xFD || c1 == 0xFE)) { - /* CP10001 */ - c2 = JIS_X_0201; - c1 &= 0x7f; - SEND; - } else { - /* already established */ - c2 = c1; - NEXT; - } - } - } else if ((c1 > SP) && (c1 != DEL)) { - /* in case of Roman characters */ - if (shift_mode) { - /* output 1 shifted byte */ - if (iso8859_f) { - c2 = ISO_8859_1; - SEND; - } else if (SP <= c1 && c1 < (0xe0&0x7f)){ - /* output 1 shifted byte */ - if (iso2022jp_f && !x0201_f) { - (*oconv)(GETA1, GETA2); - NEXT; - } else { - c2 = JIS_X_0201; - SEND; - } - } else { - /* look like bogus code */ - NEXT; - } - } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 || - input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) { - /* in case of Kanji shifted */ - c2 = c1; - NEXT; - } else if (c1 == '=' && mime_f && !mime_decode_mode) { - /* Check MIME code */ - if ((c1 = (*i_getc)(f)) == EOF) { - (*oconv)(0, '='); - LAST; - } else if (c1 == '?') { - /* =? is mime conversion start sequence */ - if(mime_f == STRICT_MIME) { - /* check in real detail */ - if (mime_begin_strict(f) == EOF) - LAST; - else - NEXT; - } else if (mime_begin(f) == EOF) - LAST; - else - NEXT; - } else { - (*oconv)(0, '='); - (*i_ungetc)(c1,f); - NEXT; - } - } else { - /* normal ASCII code */ - SEND; - } - } else if (c1 == SI && (!is_8bit || mime_decode_mode)) { - shift_mode = FALSE; - NEXT; - } else if (c1 == SO && (!is_8bit || mime_decode_mode)) { - shift_mode = TRUE; - NEXT; - } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) { - if ((c1 = (*i_getc)(f)) == EOF) { - /* (*oconv)(0, ESC); don't send bogus code */ - LAST; - } else if (c1 == '$') { - if ((c1 = (*i_getc)(f)) == EOF) { - /* - (*oconv)(0, ESC); don't send bogus code - (*oconv)(0, '$'); */ - LAST; - } else if (c1 == '@'|| c1 == 'B') { - /* This is kanji introduction */ - input_mode = JIS_X_0208; - shift_mode = FALSE; - set_input_codename("ISO-2022-JP"); -#ifdef CHECK_OPTION - debug("ISO-2022-JP"); -#endif - NEXT; - } else if (c1 == '(') { - if ((c1 = (*i_getc)(f)) == EOF) { - /* don't send bogus code - (*oconv)(0, ESC); - (*oconv)(0, '$'); - (*oconv)(0, '('); - */ - LAST; - } else if (c1 == '@'|| c1 == 'B') { - /* This is kanji introduction */ - input_mode = JIS_X_0208; - shift_mode = FALSE; - NEXT; -#ifdef X0212_ENABLE - } else if (c1 == 'D'){ - input_mode = JIS_X_0212; - shift_mode = FALSE; - NEXT; -#endif /* X0212_ENABLE */ - } else if (c1 == 0x4F){ - input_mode = JIS_X_0213_1; - shift_mode = FALSE; - NEXT; - } else if (c1 == 0x50){ - input_mode = JIS_X_0213_2; - shift_mode = FALSE; - NEXT; - } else { - /* could be some special code */ - (*oconv)(0, ESC); - (*oconv)(0, '$'); - (*oconv)(0, '('); - (*oconv)(0, c1); - NEXT; - } - } else if (broken_f&0x2) { - /* accept any ESC-(-x as broken code ... */ - input_mode = JIS_X_0208; - shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, ESC); - (*oconv)(0, '$'); - (*oconv)(0, c1); - NEXT; - } - } else if (c1 == '(') { - if ((c1 = (*i_getc)(f)) == EOF) { - /* don't send bogus code - (*oconv)(0, ESC); - (*oconv)(0, '('); */ - LAST; - } else { - if (c1 == 'I') { - /* This is X0201 kana introduction */ - input_mode = JIS_X_0201; shift_mode = JIS_X_0201; - NEXT; - } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') { - /* This is X0208 kanji introduction */ - input_mode = ASCII; shift_mode = FALSE; - NEXT; - } else if (broken_f&0x2) { - input_mode = ASCII; shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, ESC); - (*oconv)(0, '('); - /* maintain various input_mode here */ - SEND; - } - } - } else if ( c1 == 'N' || c1 == 'n'){ - /* SS2 */ - c3 = (*i_getc)(f); /* skip SS2 */ - if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){ - c1 = c3; - c2 = JIS_X_0201; - SEND; - }else{ - (*i_ungetc)(c3, f); - /* lonely ESC */ - (*oconv)(0, ESC); - SEND; - } - } else { - /* lonely ESC */ - (*oconv)(0, ESC); - SEND; - } - } else if (c1 == ESC && iconv == s_iconv) { - /* ESC in Shift_JIS */ - if ((c1 = (*i_getc)(f)) == EOF) { - /* (*oconv)(0, ESC); don't send bogus code */ - LAST; - } else if (c1 == '$') { - /* J-PHONE emoji */ - if ((c1 = (*i_getc)(f)) == EOF) { - /* - (*oconv)(0, ESC); don't send bogus code - (*oconv)(0, '$'); */ - LAST; - } else { - if (('E' <= c1 && c1 <= 'G') || - ('O' <= c1 && c1 <= 'Q')) { - /* - NUM : 0 1 2 3 4 5 - BYTE: G E F O P Q - C%7 : 1 6 0 2 3 4 - C%7 : 0 1 2 3 4 5 6 - NUM : 2 0 3 4 5 X 1 - */ - static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1}; - c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE; - while ((c1 = (*i_getc)(f)) != EOF) { - if (SP <= c1 && c1 <= 'z') { - (*oconv)(0, c1 + c0); - } else break; /* c1 == SO */ - } - } - } - if (c1 == EOF) LAST; - NEXT; - } else { - /* lonely ESC */ - (*oconv)(0, ESC); - SEND; - } - } else if (c1 == LF || c1 == CR) { - if (broken_f&4) { - input_mode = ASCII; set_iconv(FALSE, 0); - SEND; - } else if (mime_decode_f && !mime_decode_mode){ - if (c1 == LF) { - if ((c1=(*i_getc)(f))!=EOF && c1 == SP) { - i_ungetc(SP,f); - continue; - } else { - i_ungetc(c1,f); - } - c1 = LF; - SEND; - } else { /* if (c1 == CR)*/ - if ((c1=(*i_getc)(f))!=EOF) { - if (c1==SP) { - i_ungetc(SP,f); - continue; - } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) { - i_ungetc(SP,f); - continue; - } else { - i_ungetc(c1,f); - } - i_ungetc(LF,f); - } else { - i_ungetc(c1,f); - } - c1 = CR; - SEND; - } - } - } else if (c1 == DEL && input_mode == JIS_X_0208) { - /* CP5022x */ - c2 = c1; - NEXT; - } else - SEND; - } - /* send: */ - switch(input_mode){ - case ASCII: - switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */ - case -2: - /* 4 bytes UTF-8 */ - if ((c0 = (*i_getc)(f)) != EOF) { - code_status(c0); - c0 <<= 8; - if ((c3 = (*i_getc)(f)) != EOF) { - code_status(c3); - (*iconv)(c2, c1, c0|c3); - } - } - break; - case -1: - /* 3 bytes EUC or UTF-8 */ - if ((c0 = (*i_getc)(f)) != EOF) { - code_status(c0); - (*iconv)(c2, c1, c0); - } - break; - } - break; - case JIS_X_0208: - case JIS_X_0213_1: - if (ms_ucs_map_f && - 0x7F <= c2 && c2 <= 0x92 && - 0x21 <= c1 && c1 <= 0x7E) { - /* CP932 UDC */ - if(c1 == 0x7F) return 0; - c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE; - c2 = 0; - } - (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */ - break; -#ifdef X0212_ENABLE - case JIS_X_0212: - (*oconv)(PREFIX_EUCG3 | c2, c1); - break; -#endif /* X0212_ENABLE */ - case JIS_X_0213_2: - (*oconv)(PREFIX_EUCG3 | c2, c1); - break; - default: - (*oconv)(input_mode, c1); /* other special case */ - } + return ret; +} - c2 = 0; - c0 = 0; - continue; - /* goto next_word */ - } +#ifdef UTF8_OUTPUT_ENABLE +static nkf_char +e2w_conv(nkf_char c2, nkf_char c1) +{ + const unsigned short *p; - /* epilogue */ - (*iconv)(EOF, 0, 0); - if (!input_codename) - { - if (is_8bit) { - struct input_code *p = input_code_list; - struct input_code *result = p; - while (p->name){ - if (p->score < result->score) result = p; - ++p; + if (c2 == JIS_X_0201_1976_K) { + if (ms_ucs_map_f == UCS_MAP_CP10001) { + switch (c1) { + case 0x20: + return 0xA0; + case 0x7D: + return 0xA9; } - set_input_codename(result->name); -#ifdef CHECK_OPTION - debug(result->name); -#endif } + p = euc_to_utf8_1byte; +#ifdef X0212_ENABLE + } else if (is_eucg3(c2)){ + if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){ + return 0xA6; + } + c2 = (c2&0x7f) - 0x21; + if (0<=c2 && c2name){ - if (p->status_func && p->score < result->score){ - result = p; - } - ++p; - } - set_iconv(TRUE, result->iconv_func); - } - - - /** now, - ** 1) EOF is detected, or - ** 2) Code is established, or - ** 3) Buffer is FULL (but last word is pushed) - ** - ** in 1) and 3) cases, we continue to use - ** Kanji codes by oconv and leave estab_f unchanged. - **/ + nkf_char ret = 0; - ret = c1; - hold_index = 0; - while (hold_index < hold_count){ - c2 = hold_buf[hold_index++]; - if (c2 <= DEL + if (!c1){ + *p2 = 0; + *p1 = c2; + }else if (0xc0 <= c2 && c2 <= 0xef) { + ret = unicode_to_jis_common(c2, c1, c0, p2, p1); #ifdef NUMCHAR_OPTION - || is_unicode_capsule(c2) -#endif - ){ - (*iconv)(0, c2, 0); - continue; - }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){ - (*iconv)(JIS_X_0201, c2, 0); - continue; - } - if (hold_index < hold_count){ - c1 = hold_buf[hold_index++]; - }else{ - c1 = (*i_getc)(f); - if (c1 == EOF){ - c3 = EOF; - break; - } - code_status(c1); - } - c0 = 0; - switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */ - case -2: - /* 4 bytes UTF-8 */ - if (hold_index < hold_count){ - c0 = hold_buf[hold_index++]; - } else if ((c0 = (*i_getc)(f)) == EOF) { - ret = EOF; - break; - } else { - code_status(c0); - c0 <<= 8; - if (hold_index < hold_count){ - c3 = hold_buf[hold_index++]; - } else if ((c3 = (*i_getc)(f)) == EOF) { - c0 = ret = EOF; - break; - } else { - code_status(c3); - (*iconv)(c2, c1, c0|c3); - } - } - break; - case -1: - /* 3 bytes EUC or UTF-8 */ - if (hold_index < hold_count){ - c0 = hold_buf[hold_index++]; - } else if ((c0 = (*i_getc)(f)) == EOF) { - ret = EOF; - break; - } else { - code_status(c0); - } - (*iconv)(c2, c1, c0); - break; + if (ret > 0){ + if (p2) *p2 = 0; + if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0)); + ret = 0; } - if (c0 == EOF) break; +#endif } return ret; } -nkf_char push_hold_buf(nkf_char c2) -{ - if (hold_count >= HOLD_SIZE*2) - return (EOF); - hold_buf[hold_count++] = (unsigned char)c2; - return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); -} - -nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) +#ifdef UTF8_INPUT_ENABLE +static nkf_char +w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1) { -#if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE) - nkf_char val; -#endif - static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} }; -#ifdef SHIFTJIS_CP932 - if (!cp932inv_f && is_ibmext_in_sjis(c2)){ - val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40]; - if (val){ - c2 = val >> 8; - c1 = val & 0xff; - } - } - if (cp932inv_f - && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){ - nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40]; - if (c){ - c2 = c >> 8; - c1 = c & 0xff; - } - } -#endif /* SHIFTJIS_CP932 */ -#ifdef X0212_ENABLE - if (!x0213_f && is_ibmext_in_sjis(c2)){ - val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40]; - if (val){ - if (val > 0x7FFF){ - c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f); - c1 = val & 0xff; - }else{ - c2 = val >> 8; - c1 = val & 0xff; - } - if (p2) *p2 = c2; - if (p1) *p1 = c1; - return 0; - } + nkf_char c1, c2, c3, c4; + nkf_char ret = 0; + val &= VALUE_MASK; + if (val < 0x80) { + *p2 = 0; + *p1 = val; } -#endif - if(c2 >= 0x80){ - if(x0213_f && c2 >= 0xF0){ - if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */ - c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1]; - }else{ /* 78<=k<=94 */ - c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B); - if (0x9E < c1) c2++; - } - }else{ - c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394); - if (0x9E < c1) c2++; - } - if (c1 < 0x9F) - c1 = c1 - ((c1 > DEL) ? SP : 0x1F); - else { - c1 = c1 - 0x7E; + else if (nkf_char_unicode_bmp_p(val)){ + nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); + ret = unicode_to_jis_common(c1, c2, c3, p2, p1); + if (ret > 0){ + *p2 = 0; + *p1 = nkf_char_unicode_new(val); + ret = 0; } } - -#ifdef X0212_ENABLE - c2 = x0212_unshift(c2); -#endif - if (p2) *p2 = c2; - if (p1) *p1 = c1; - return 0; + else { + *p2 = 0; + *p1 = nkf_char_unicode_new(val); + } + return ret; } +#endif -nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0) +static nkf_char +e_iconv(nkf_char c2, nkf_char c1, nkf_char c0) { - if (c2 == JIS_X_0201) { - c1 &= 0x7f; - } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) { - /* NOP */ - } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) { - /* CP932 UDC */ - if(c1 == 0x7F) return 0; - c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE; - c2 = 0; - } else { - nkf_char ret = s2e_conv(c2, c1, &c2, &c1); - if (ret) return ret; - } - (*oconv)(c2, c1); - return 0; -} - -nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0) -{ - if (c2 == JIS_X_0201) { - c1 &= 0x7f; + if (c2 == JIS_X_0201_1976_K || c2 == SS2){ + if (iso2022jp_f && !x0201_f) { + c2 = GETA1; c1 = GETA2; + } else { + c2 = JIS_X_0201_1976_K; + c1 &= 0x7f; + } #ifdef X0212_ENABLE }else if (c2 == 0x8f){ - if (c0 == 0){ - return -1; - } + if (c0 == 0){ + return -1; + } if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) { /* encoding is eucJP-ms, so invert to Unicode Private User Area */ - c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE; + c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC); c2 = 0; } else { c2 = (c2 << 8) | (c1 & 0x7f); @@ -3454,17 +2047,14 @@ nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0) } } #endif /* SHIFTJIS_CP932 */ - } + } #endif /* X0212_ENABLE */ - } else if (c2 == SSO){ - c2 = JIS_X_0201; - c1 &= 0x7f; - } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) { - /* NOP */ + } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) { + /* NOP */ } else { if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) { /* encoding is eucJP-ms, so invert to Unicode Private User Area */ - c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE; + c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000); c2 = 0; } else { c1 &= 0x7f; @@ -3481,36 +2071,40 @@ nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0) } } #endif /* SHIFTJIS_CP932 */ - } + } } (*oconv)(c2, c1); return 0; } -#ifdef UTF8_INPUT_ENABLE -nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1) +static nkf_char +s_iconv(nkf_char c2, nkf_char c1, nkf_char c0) { - nkf_char ret = 0; - - if (!c1){ - *p2 = 0; - *p1 = c2; - }else if (0xc0 <= c2 && c2 <= 0xef) { - ret = unicode_to_jis_common(c2, c1, c0, p2, p1); -#ifdef NUMCHAR_OPTION - if (ret > 0){ - if (p2) *p2 = 0; - if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0); - ret = 0; - } -#endif + if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) { + if (iso2022jp_f && !x0201_f) { + c2 = GETA1; c1 = GETA2; + } else { + c1 &= 0x7f; + } + } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) { + /* NOP */ + } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) { + /* CP932 UDC */ + if(c1 == 0x7F) return 0; + c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000); + c2 = 0; + } else { + nkf_char ret = s2e_conv(c2, c1, &c2, &c1); + if (ret) return ret; } - return ret; + (*oconv)(c2, c1); + return 0; } -nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0) +static nkf_char +w_iconv(nkf_char c1, nkf_char c2, nkf_char c3) { - nkf_char ret = 0; + nkf_char ret = 0, c4 = 0; static const char w_iconv_utf8_1st_byte[] = { /* 0xC0 - 0xFF */ 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, @@ -3518,45 +2112,50 @@ nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0) 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70}; - if (c2 < 0 || 0xff < c2) { - }else if (c2 == 0) { /* 0 : 1 byte*/ - c0 = 0; - } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */ + if (c3 > 0xFF) { + c4 = c3 & 0xFF; + c3 >>= 8; + } + + if (c1 < 0 || 0xff < c1) { + }else if (c1 == 0) { /* 0 : 1 byte*/ + c3 = 0; + } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */ return 0; } else{ - switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) { + switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) { case 21: - if (c1 < 0x80 || 0xBF < c1) return 0; + if (c2 < 0x80 || 0xBF < c2) return 0; break; case 30: - if (c0 == 0) return -1; - if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80) + if (c3 == 0) return -1; + if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80) return 0; break; case 31: case 33: - if (c0 == 0) return -1; - if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80) + if (c3 == 0) return -1; + if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80) return 0; break; case 32: - if (c0 == 0) return -1; - if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80) + if (c3 == 0) return -1; + if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80) return 0; break; case 40: - if (c0 == 0) return -2; - if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080) + if (c3 == 0) return -2; + if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) return 0; break; case 41: - if (c0 == 0) return -2; - if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080) + if (c3 == 0) return -2; + if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) return 0; break; case 42: - if (c0 == 0) return -2; - if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080) + if (c3 == 0) return -2; + if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80) return 0; break; default: @@ -3564,1215 +2163,1448 @@ nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0) break; } } - if (c2 == 0 || c2 == EOF){ - } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */ - c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0); - c2 = 0; + if (c1 == 0 || c1 == EOF){ + } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */ + c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4)); + c1 = 0; } else { - ret = w2e_conv(c2, c1, c0, &c2, &c1); + ret = w2e_conv(c1, c2, c3, &c1, &c2); } if (ret == 0){ - (*oconv)(c2, c1); + (*oconv)(c1, c2); } return ret; } -#endif -#if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) -void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0) +#define NKF_ICONV_INVALID_CODE_RANGE -13 +static size_t +unicode_iconv(nkf_char wc) { - val &= VALUE_MASK; - if (val < 0x80){ - *p2 = val; - *p1 = 0; - *p0 = 0; - }else if (val < 0x800){ - *p2 = 0xc0 | (val >> 6); - *p1 = 0x80 | (val & 0x3f); - *p0 = 0; - } else if (val <= NKF_INT32_C(0xFFFF)) { - *p2 = 0xe0 | (val >> 12); - *p1 = 0x80 | ((val >> 6) & 0x3f); - *p0 = 0x80 | (val & 0x3f); - } else if (val <= NKF_INT32_C(0x10FFFF)) { - *p2 = 0xe0 | (val >> 16); - *p1 = 0x80 | ((val >> 12) & 0x3f); - *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f); + nkf_char c1, c2; + int ret = 0; + + if (wc < 0x80) { + c2 = 0; + c1 = wc; + }else if ((wc>>11) == 27) { + /* unpaired surrogate */ + return NKF_ICONV_INVALID_CODE_RANGE; + }else if (wc < 0xFFFF) { + ret = w16e_conv(wc, &c2, &c1); + if (ret) return ret; + }else if (wc < 0x10FFFF) { + c2 = 0; + c1 = nkf_char_unicode_new(wc); + } else { + return NKF_ICONV_INVALID_CODE_RANGE; + } + (*oconv)(c2, c1); + return 0; +} + +#define NKF_ICONV_NEED_ONE_MORE_BYTE (size_t)-1 +#define NKF_ICONV_NEED_TWO_MORE_BYTES (size_t)-2 +#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00)) +static size_t +nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) +{ + nkf_char wc; + + if (c1 == EOF) { + (*oconv)(EOF, 0); + return 0; + } + + if (input_endian == ENDIAN_BIG) { + if (0xD8 <= c1 && c1 <= 0xDB) { + if (0xDC <= c3 && c3 <= 0xDF) { + wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4); + } else return NKF_ICONV_NEED_TWO_MORE_BYTES; + } else { + wc = c1 << 8 | c2; + } } else { - *p2 = 0; - *p1 = 0; - *p0 = 0; + if (0xD8 <= c2 && c2 <= 0xDB) { + if (0xDC <= c4 && c4 <= 0xDF) { + wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3); + } else return NKF_ICONV_NEED_TWO_MORE_BYTES; + } else { + wc = c2 << 8 | c1; + } + } + + return (*unicode_iconv)(wc); +} + +static nkf_char +w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0) +{ + (*oconv)(c2, c1); + return 16; /* different from w_iconv32 */ +} + +static nkf_char +w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0) +{ + (*oconv)(c2, c1); + return 32; /* different from w_iconv16 */ +} + +static size_t +nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4) +{ + nkf_char wc; + + if (c1 == EOF) { + (*oconv)(EOF, 0); + return 0; + } + + switch(input_endian){ + case ENDIAN_BIG: + wc = c2 << 16 | c3 << 8 | c4; + break; + case ENDIAN_LITTLE: + wc = c3 << 16 | c2 << 8 | c1; + break; + case ENDIAN_2143: + wc = c1 << 16 | c4 << 8 | c3; + break; + case ENDIAN_3412: + wc = c4 << 16 | c1 << 8 | c2; + break; + default: + return NKF_ICONV_INVALID_CODE_RANGE; } + + return (*unicode_iconv)(wc); } #endif -#ifdef UTF8_INPUT_ENABLE -nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0) +#define output_ascii_escape_sequence(mode) do { \ + if (output_mode != ASCII && output_mode != ISO_8859_1) { \ + (*o_putc)(ESC); \ + (*o_putc)('('); \ + (*o_putc)(ascii_intro); \ + output_mode = mode; \ + } \ + } while (0) + +static void +output_escape_sequence(int mode) { - nkf_char val; - if (c2 >= 0xf8) { - val = -1; - } else if (c2 >= 0xf0){ - /* c2: 1st, c1: 2nd, c0: 3rd/4th */ - val = (c2 & 0x0f) << 18; - val |= (c1 & 0x3f) << 12; - val |= (c0 & 0x3f00) >> 2; - val |= (c0 & 0x3f); - }else if (c2 >= 0xe0){ - val = (c2 & 0x0f) << 12; - val |= (c1 & 0x3f) << 6; - val |= (c0 & 0x3f); - }else if (c2 >= 0xc0){ - val = (c2 & 0x1f) << 6; - val |= (c1 & 0x3f); - }else{ - val = c2; + if (output_mode == mode) + return; + switch(mode) { + case ISO_8859_1: + (*o_putc)(ESC); + (*o_putc)('.'); + (*o_putc)('A'); + break; + case JIS_X_0201_1976_K: + (*o_putc)(ESC); + (*o_putc)('('); + (*o_putc)('I'); + break; + case JIS_X_0208: + (*o_putc)(ESC); + (*o_putc)('$'); + (*o_putc)(kanji_intro); + break; + case JIS_X_0212: + (*o_putc)(ESC); + (*o_putc)('$'); + (*o_putc)('('); + (*o_putc)('D'); + break; + case JIS_X_0213_1: + (*o_putc)(ESC); + (*o_putc)('$'); + (*o_putc)('('); + (*o_putc)('Q'); + break; + case JIS_X_0213_2: + (*o_putc)(ESC); + (*o_putc)('$'); + (*o_putc)('('); + (*o_putc)('P'); + break; } - return val; + output_mode = mode; } -nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1) +static void +j_oconv(nkf_char c2, nkf_char c1) { - nkf_char c2, c1, c0; - nkf_char ret = 0; - val &= VALUE_MASK; - if (val < 0x80){ - *p2 = 0; - *p1 = val; - }else{ - w16w_conv(val, &c2, &c1, &c0); - ret = unicode_to_jis_common(c2, c1, c0, p2, p1); #ifdef NUMCHAR_OPTION - if (ret > 0){ - *p2 = 0; - *p1 = CLASS_UNICODE | val; - ret = 0; + if (c2 == 0 && nkf_char_unicode_p(c1)){ + w16e_conv(c1, &c2, &c1); + if (c2 == 0 && nkf_char_unicode_p(c1)){ + c2 = c1 & VALUE_MASK; + if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) { + /* CP5022x UDC */ + c1 &= 0xFFF; + c2 = 0x7F + c1 / 94; + c1 = 0x21 + c1 % 94; + } else { + if (encode_fallback) (*encode_fallback)(c1); + return; + } } + } #endif + if (c2 == 0) { + output_ascii_escape_sequence(ASCII); + (*o_putc)(c1); + } + else if (c2 == EOF) { + output_ascii_escape_sequence(ASCII); + (*o_putc)(EOF); + } + else if (c2 == ISO_8859_1) { + output_ascii_escape_sequence(ISO_8859_1); + (*o_putc)(c1|0x80); + } + else if (c2 == JIS_X_0201_1976_K) { + output_escape_sequence(JIS_X_0201_1976_K); + (*o_putc)(c1); +#ifdef X0212_ENABLE + } else if (is_eucg3(c2)){ + output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212); + (*o_putc)(c2 & 0x7f); + (*o_putc)(c1); +#endif + } else { + if(ms_ucs_map_f + ? c2<0x20 || 0x92>3) == 27) { /* unpaired surrogate */ - /* - return 2; - */ - return 1; - }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1); - if (ret) return ret; - (*oconv)(c2, c1); - return 0; +#ifdef NUMCHAR_OPTION + if (c2 == 0 && nkf_char_unicode_p(c1)){ + w16e_conv(c1, &c2, &c1); + if (c2 == 0 && nkf_char_unicode_p(c1)){ + c2 = c1 & VALUE_MASK; + if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) { + /* CP932 UDC */ + c1 &= 0xFFF; + c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB); + c1 = c1 % 188; + c1 += 0x40 + (c1 > 0x3e); + (*o_putc)(c2); + (*o_putc)(c1); + return; + } else { + if(encode_fallback)(*encode_fallback)(c1); + return; + } + } + } +#endif + if (c2 == EOF) { + (*o_putc)(EOF); + return; + } else if (c2 == 0) { + output_mode = ASCII; + (*o_putc)(c1); + } else if (c2 == JIS_X_0201_1976_K) { + output_mode = SHIFT_JIS; + (*o_putc)(c1|0x80); + } else if (c2 == ISO_8859_1) { + output_mode = ISO_8859_1; + (*o_putc)(c1 | 0x080); +#ifdef X0212_ENABLE + } else if (is_eucg3(c2)){ + output_mode = SHIFT_JIS; + if (e2s_conv(c2, c1, &c2, &c1) == 0){ + (*o_putc)(c2); + (*o_putc)(c1); + } +#endif + } else { + if (!nkf_isprint(c1) || !nkf_isprint(c2)) { + set_iconv(FALSE, 0); + return; /* too late to rescue this char */ + } + output_mode = SHIFT_JIS; + e2s_conv(c2, c1, &c2, &c1); + +#ifdef SHIFTJIS_CP932 + if (cp932inv_f + && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){ + nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40]; + if (c){ + c2 = c >> 8; + c1 = c & 0xff; + } + } +#endif /* SHIFTJIS_CP932 */ + + (*o_putc)(c2); + if (prefix_table[(unsigned char)c1]){ + (*o_putc)(prefix_table[(unsigned char)c1]); + } + (*o_putc)(c1); + } } -nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0) +#ifdef UTF8_OUTPUT_ENABLE +static void +w_oconv(nkf_char c2, nkf_char c1) { - int ret = 0; + nkf_char c3, c4; + nkf_char val; + + if (output_bom_f) { + output_bom_f = FALSE; + (*o_putc)('\357'); + (*o_putc)('\273'); + (*o_putc)('\277'); + } + + if (c2 == EOF) { + (*o_putc)(EOF); + return; + } + + if (c2 == 0 && nkf_char_unicode_p(c1)){ + val = c1 & VALUE_MASK; + nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); + (*o_putc)(c1); + if (c2) (*o_putc)(c2); + if (c3) (*o_putc)(c3); + if (c4) (*o_putc)(c4); + return; + } - if ((c2 == 0 && c1 < 0x80) || c2==EOF) { - } else if (is_unicode_bmp(c1)) { - ret = w16e_conv(c1, &c2, &c1); + if (c2 == 0) { + (*o_putc)(c1); } else { - c2 = 0; - c1 = CLASS_UNICODE | c1; + val = e2w_conv(c2, c1); + if (val){ + nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4); + (*o_putc)(c1); + if (c2) (*o_putc)(c2); + if (c3) (*o_putc)(c3); + if (c4) (*o_putc)(c4); + } } - if (ret) return ret; - (*oconv)(c2, c1); - return 0; } -nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1) +static void +w_oconv16(nkf_char c2, nkf_char c1) { - const unsigned short *const *pp; - const unsigned short *const *const *ppp; - static const char no_best_fit_chars_table_C2[] = - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2, - 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1}; - static const char no_best_fit_chars_table_C2_ms[] = - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, - 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0}; - static const char no_best_fit_chars_table_932_C2[] = - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0}; - static const char no_best_fit_chars_table_932_C3[] = - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1}; - nkf_char ret = 0; - - if(c2 < 0x80){ - *p2 = 0; - *p1 = c2; - }else if(c2 < 0xe0){ - if(no_best_fit_chars_f){ - if(ms_ucs_map_f == UCS_MAP_CP932){ - switch(c2){ - case 0xC2: - if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1; - break; - case 0xC3: - if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1; - break; - } - }else if(!cp932inv_f){ - switch(c2){ - case 0xC2: - if(no_best_fit_chars_table_C2[c1&0x3F]) return 1; - break; - case 0xC3: - if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1; - break; - } - }else if(ms_ucs_map_f == UCS_MAP_MS){ - if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1; - }else if(ms_ucs_map_f == UCS_MAP_CP10001){ - switch(c2){ - case 0xC2: - switch(c1){ - case 0xA2: - case 0xA3: - case 0xA5: - case 0xA6: - case 0xAC: - case 0xAF: - case 0xB8: - return 1; - } - break; - } - } + if (output_bom_f) { + output_bom_f = FALSE; + if (output_endian == ENDIAN_LITTLE){ + (*o_putc)(0xFF); + (*o_putc)(0xFE); + }else{ + (*o_putc)(0xFE); + (*o_putc)(0xFF); } - pp = - ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 : - ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms : - ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac : - utf8_to_euc_2bytes; - ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1); - }else if(c0 < 0xF0){ - if(no_best_fit_chars_f){ - if(ms_ucs_map_f == UCS_MAP_CP932){ - if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1; - }else if(ms_ucs_map_f == UCS_MAP_MS){ - switch(c2){ - case 0xE2: - switch(c1){ - case 0x80: - if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1; - break; - case 0x88: - if(c0 == 0x92) return 1; - break; - } - break; - case 0xE3: - if(c1 == 0x80 || c0 == 0x9C) return 1; - break; - } - }else if(ms_ucs_map_f == UCS_MAP_CP10001){ - switch(c2){ - case 0xE3: - switch(c1){ - case 0x82: - if(c0 == 0x94) return 1; - break; - case 0x83: - if(c0 == 0xBB) return 1; - break; - } - break; - } - }else{ - switch(c2){ - case 0xE2: - switch(c1){ - case 0x80: - if(c0 == 0x95) return 1; - break; - case 0x88: - if(c0 == 0xA5) return 1; - break; - } - break; - case 0xEF: - switch(c1){ - case 0xBC: - if(c0 == 0x8D) return 1; - break; - case 0xBD: - if(c0 == 0x9E && !cp932inv_f) return 1; - break; - case 0xBF: - if(0xA0 <= c0 && c0 <= 0xA5) return 1; - break; - } - break; + } + + if (c2 == EOF) { + (*o_putc)(EOF); + return; + } + + if (c2 == 0 && nkf_char_unicode_p(c1)) { + if (nkf_char_unicode_bmp_p(c1)) { + c2 = (c1 >> 8) & 0xff; + c1 &= 0xff; + } else { + c1 &= VALUE_MASK; + if (c1 <= UNICODE_MAX) { + c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ + c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ + if (output_endian == ENDIAN_LITTLE){ + (*o_putc)(c2 & 0xff); + (*o_putc)((c2 >> 8) & 0xff); + (*o_putc)(c1 & 0xff); + (*o_putc)((c1 >> 8) & 0xff); + }else{ + (*o_putc)((c2 >> 8) & 0xff); + (*o_putc)(c2 & 0xff); + (*o_putc)((c1 >> 8) & 0xff); + (*o_putc)(c1 & 0xff); } } + return; } - ppp = - ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 : - ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms : - ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac : - utf8_to_euc_3bytes; - ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1); - }else return -1; -#ifdef SHIFTJIS_CP932 - if (!ret && !cp932inv_f && is_eucg3(*p2)) { - nkf_char s2, s1; - if (e2s_conv(*p2, *p1, &s2, &s1) == 0) { - s2e_conv(s2, s1, p2, p1); + } else if (c2) { + nkf_char val = e2w_conv(c2, c1); + c2 = (val >> 8) & 0xff; + c1 = val & 0xff; + if (!val) return; + } + + if (output_endian == ENDIAN_LITTLE){ + (*o_putc)(c1); + (*o_putc)(c2); + }else{ + (*o_putc)(c2); + (*o_putc)(c1); + } +} + +static void +w_oconv32(nkf_char c2, nkf_char c1) +{ + if (output_bom_f) { + output_bom_f = FALSE; + if (output_endian == ENDIAN_LITTLE){ + (*o_putc)(0xFF); + (*o_putc)(0xFE); + (*o_putc)(0); + (*o_putc)(0); }else{ - ret = 1; + (*o_putc)(0); + (*o_putc)(0); + (*o_putc)(0xFE); + (*o_putc)(0xFF); } } -#endif - return ret; + + if (c2 == EOF) { + (*o_putc)(EOF); + return; + } + + if (c2 == ISO_8859_1) { + c1 |= 0x80; + } else if (c2 == 0 && nkf_char_unicode_p(c1)) { + c1 &= VALUE_MASK; + } else if (c2) { + c1 = e2w_conv(c2, c1); + if (!c1) return; + } + if (output_endian == ENDIAN_LITTLE){ + (*o_putc)( c1 & 0xFF); + (*o_putc)((c1 >> 8) & 0xFF); + (*o_putc)((c1 >> 16) & 0xFF); + (*o_putc)(0); + }else{ + (*o_putc)(0); + (*o_putc)((c1 >> 16) & 0xFF); + (*o_putc)((c1 >> 8) & 0xFF); + (*o_putc)( c1 & 0xFF); + } } +#endif -nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1) -{ - nkf_char c2; - const unsigned short *p; - unsigned short val; +#define SCORE_L2 (1) /* Kanji Level 2 */ +#define SCORE_KANA (SCORE_L2 << 1) /* Halfwidth Katakana */ +#define SCORE_DEPEND (SCORE_KANA << 1) /* MD Characters */ +#define SCORE_CP932 (SCORE_DEPEND << 1) /* IBM extended characters */ +#define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */ +#define SCORE_NO_EXIST (SCORE_X0212 << 1) /* Undefined Characters */ +#define SCORE_iMIME (SCORE_NO_EXIST << 1) /* MIME selected */ +#define SCORE_ERROR (SCORE_iMIME << 1) /* Error */ - if (pp == 0) return 1; +#define SCORE_INIT (SCORE_iMIME) - c1 -= 0x80; - if (c1 < 0 || psize <= c1) return 1; - p = pp[c1]; - if (p == 0) return 1; +static const nkf_char score_table_A0[] = { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, + SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST, +}; - c0 -= 0x80; - if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1; - val = p[c0]; - if (val == 0) return 1; - if (no_cp932ext_f && ( - (val>>8) == 0x2D || /* NEC special characters */ - val > NKF_INT32_C(0xF300) /* IBM extended characters */ - )) return 1; +static const nkf_char score_table_F0[] = { + SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2, + SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, + SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932, + SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR, +}; - c2 = val >> 8; - if (val > 0x7FFF){ - c2 &= 0x7f; - c2 |= PREFIX_EUCG3; +static void +set_code_score(struct input_code *ptr, nkf_char score) +{ + if (ptr){ + ptr->score |= score; } - if (c2 == SO) c2 = JIS_X_0201; - c1 = val & 0x7f; - if (p2) *p2 = c2; - if (p1) *p1 = c1; - return 0; } -void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c) +static void +clr_code_score(struct input_code *ptr, nkf_char score) { - int shift = 20; - c &= VALUE_MASK; - while(shift >= 0){ - if(c >= 1<= 0){ - (*f)(0, bin2hex(c>>shift)); - shift -= 4; - } - }else{ - shift -= 4; - } + if (ptr){ + ptr->score &= ~score; } - return; } -void encode_fallback_html(nkf_char c) +static void +code_score(struct input_code *ptr) { - (*oconv)(0, '&'); - (*oconv)(0, '#'); - c &= VALUE_MASK; - if(c >= NKF_INT32_C(1000000)) - (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10); - if(c >= NKF_INT32_C(100000)) - (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10); - if(c >= 10000) - (*oconv)(0, 0x30+(c/10000 )%10); - if(c >= 1000) - (*oconv)(0, 0x30+(c/1000 )%10); - if(c >= 100) - (*oconv)(0, 0x30+(c/100 )%10); - if(c >= 10) - (*oconv)(0, 0x30+(c/10 )%10); - if(c >= 0) - (*oconv)(0, 0x30+ c %10); - (*oconv)(0, ';'); - return; + nkf_char c2 = ptr->buf[0]; +#ifdef UTF8_OUTPUT_ENABLE + nkf_char c1 = ptr->buf[1]; +#endif + if (c2 < 0){ + set_code_score(ptr, SCORE_ERROR); + }else if (c2 == SS2){ + set_code_score(ptr, SCORE_KANA); + }else if (c2 == 0x8f){ + set_code_score(ptr, SCORE_X0212); +#ifdef UTF8_OUTPUT_ENABLE + }else if (!e2w_conv(c2, c1)){ + set_code_score(ptr, SCORE_NO_EXIST); +#endif + }else if ((c2 & 0x70) == 0x20){ + set_code_score(ptr, score_table_A0[c2 & 0x0f]); + }else if ((c2 & 0x70) == 0x70){ + set_code_score(ptr, score_table_F0[c2 & 0x0f]); + }else if ((c2 & 0x70) >= 0x50){ + set_code_score(ptr, SCORE_L2); + } } -void encode_fallback_xml(nkf_char c) +static void +status_disable(struct input_code *ptr) { - (*oconv)(0, '&'); - (*oconv)(0, '#'); - (*oconv)(0, 'x'); - nkf_each_char_to_hex(oconv, c); - (*oconv)(0, ';'); - return; + ptr->stat = -1; + ptr->buf[0] = -1; + code_score(ptr); + if (iconv == ptr->iconv_func) set_iconv(FALSE, 0); } -void encode_fallback_java(nkf_char c) +static void +status_push_ch(struct input_code *ptr, nkf_char c) { - (*oconv)(0, '\\'); - c &= VALUE_MASK; - if(!is_unicode_bmp(c)){ - (*oconv)(0, 'U'); - (*oconv)(0, '0'); - (*oconv)(0, '0'); - (*oconv)(0, bin2hex(c>>20)); - (*oconv)(0, bin2hex(c>>16)); - }else{ - (*oconv)(0, 'u'); - } - (*oconv)(0, bin2hex(c>>12)); - (*oconv)(0, bin2hex(c>> 8)); - (*oconv)(0, bin2hex(c>> 4)); - (*oconv)(0, bin2hex(c )); - return; + ptr->buf[ptr->index++] = c; } -void encode_fallback_perl(nkf_char c) +static void +status_clear(struct input_code *ptr) { - (*oconv)(0, '\\'); - (*oconv)(0, 'x'); - (*oconv)(0, '{'); - nkf_each_char_to_hex(oconv, c); - (*oconv)(0, '}'); - return; + ptr->stat = 0; + ptr->index = 0; } -void encode_fallback_subchar(nkf_char c) +static void +status_reset(struct input_code *ptr) { - c = unicode_subchar; - (*oconv)((c>>8)&0xFF, c&0xFF); - return; + status_clear(ptr); + ptr->score = SCORE_INIT; } -#endif -#ifdef UTF8_OUTPUT_ENABLE -nkf_char e2w_conv(nkf_char c2, nkf_char c1) +static void +status_reinit(struct input_code *ptr) { - const unsigned short *p; + status_reset(ptr); + ptr->_file_stat = 0; +} - if (c2 == JIS_X_0201) { - if (ms_ucs_map_f == UCS_MAP_CP10001) { - switch (c1) { - case 0x20: - return 0xA0; - case 0x7D: - return 0xA9; +static void +status_check(struct input_code *ptr, nkf_char c) +{ + if (c <= DEL && estab_f){ + status_reset(ptr); + } +} + +static void +s_status(struct input_code *ptr, nkf_char c) +{ + switch(ptr->stat){ + case -1: + status_check(ptr, c); + break; + case 0: + if (c <= DEL){ + break; + }else if (nkf_char_unicode_p(c)){ + break; + }else if (0xa1 <= c && c <= 0xdf){ + status_push_ch(ptr, SS2); + status_push_ch(ptr, c); + code_score(ptr); + status_clear(ptr); + }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){ + ptr->stat = 1; + status_push_ch(ptr, c); + }else if (0xed <= c && c <= 0xee){ + ptr->stat = 3; + status_push_ch(ptr, c); +#ifdef SHIFTJIS_CP932 + }else if (is_ibmext_in_sjis(c)){ + ptr->stat = 2; + status_push_ch(ptr, c); +#endif /* SHIFTJIS_CP932 */ +#ifdef X0212_ENABLE + }else if (0xf0 <= c && c <= 0xfc){ + ptr->stat = 1; + status_push_ch(ptr, c); +#endif /* X0212_ENABLE */ + }else{ + status_disable(ptr); + } + break; + case 1: + if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){ + status_push_ch(ptr, c); + s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]); + code_score(ptr); + status_clear(ptr); + }else{ + status_disable(ptr); + } + break; + case 2: +#ifdef SHIFTJIS_CP932 + if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) { + status_push_ch(ptr, c); + if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) { + set_code_score(ptr, SCORE_CP932); + status_clear(ptr); + break; } } - p = euc_to_utf8_1byte; -#ifdef X0212_ENABLE - } else if (is_eucg3(c2)){ - if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){ - return 0xA6; +#endif /* SHIFTJIS_CP932 */ + status_disable(ptr); + break; + case 3: + if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){ + status_push_ch(ptr, c); + s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]); + set_code_score(ptr, SCORE_CP932); + status_clear(ptr); + }else{ + status_disable(ptr); } - c2 = (c2&0x7f) - 0x21; - if (0<=c2 && c2stat){ + case -1: + status_check(ptr, c); + break; + case 0: + if (c <= DEL){ + break; + }else if (nkf_char_unicode_p(c)){ + break; + }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){ + ptr->stat = 1; + status_push_ch(ptr, c); +#ifdef X0212_ENABLE + }else if (0x8f == c){ + ptr->stat = 2; + status_push_ch(ptr, c); +#endif /* X0212_ENABLE */ + }else{ + status_disable(ptr); + } + break; + case 1: + if (0xa1 <= c && c <= 0xfe){ + status_push_ch(ptr, c); + code_score(ptr); + status_clear(ptr); + }else{ + status_disable(ptr); + } + break; +#ifdef X0212_ENABLE + case 2: + if (0xa1 <= c && c <= 0xfe){ + ptr->stat = 1; + status_push_ch(ptr, c); + }else{ + status_disable(ptr); + } +#endif /* X0212_ENABLE */ } +} - if (c2 == EOF) { - (*o_putc)(EOF); - return; +#ifdef UTF8_INPUT_ENABLE +static void +w_status(struct input_code *ptr, nkf_char c) +{ + switch (ptr->stat){ + case -1: + status_check(ptr, c); + break; + case 0: + if (c <= DEL){ + break; + }else if (nkf_char_unicode_p(c)){ + break; + }else if (0xc0 <= c && c <= 0xdf){ + ptr->stat = 1; + status_push_ch(ptr, c); + }else if (0xe0 <= c && c <= 0xef){ + ptr->stat = 2; + status_push_ch(ptr, c); + }else if (0xf0 <= c && c <= 0xf4){ + ptr->stat = 3; + status_push_ch(ptr, c); + }else{ + status_disable(ptr); + } + break; + case 1: + case 2: + if (0x80 <= c && c <= 0xbf){ + status_push_ch(ptr, c); + if (ptr->index > ptr->stat){ + int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb + && ptr->buf[2] == 0xbf); + w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2], + &ptr->buf[0], &ptr->buf[1]); + if (!bom){ + code_score(ptr); + } + status_clear(ptr); + } + }else{ + status_disable(ptr); + } + break; + case 3: + if (0x80 <= c && c <= 0xbf){ + if (ptr->index < ptr->stat){ + status_push_ch(ptr, c); + } else { + status_clear(ptr); + } + }else{ + status_disable(ptr); + } + break; } +} +#endif -#ifdef NUMCHAR_OPTION - if (c2 == 0 && is_unicode_capsule(c1)){ - val = c1 & VALUE_MASK; - if (val < 0x80){ - (*o_putc)(val); - }else if (val < 0x800){ - (*o_putc)(0xC0 | (val >> 6)); - (*o_putc)(0x80 | (val & 0x3f)); - } else if (val <= NKF_INT32_C(0xFFFF)) { - (*o_putc)(0xE0 | (val >> 12)); - (*o_putc)(0x80 | ((val >> 6) & 0x3f)); - (*o_putc)(0x80 | (val & 0x3f)); - } else if (val <= NKF_INT32_C(0x10FFFF)) { - (*o_putc)(0xF0 | ( val>>18)); - (*o_putc)(0x80 | ((val>>12) & 0x3f)); - (*o_putc)(0x80 | ((val>> 6) & 0x3f)); - (*o_putc)(0x80 | ( val & 0x3f)); - } - return; +static void +code_status(nkf_char c) +{ + int action_flag = 1; + struct input_code *result = 0; + struct input_code *p = input_code_list; + while (p->name){ + if (!p->status_func) { + ++p; + continue; + } + if (!p->status_func) + continue; + (p->status_func)(p, c); + if (p->stat > 0){ + action_flag = 0; + }else if(p->stat == 0){ + if (result){ + action_flag = 0; + }else{ + result = p; + } + } + ++p; } -#endif - if (c2 == 0) { - output_mode = ASCII; - (*o_putc)(c1); - } else if (c2 == ISO_8859_1) { - output_mode = UTF_8; - (*o_putc)(c1 | 0x080); - } else { - output_mode = UTF_8; - val = e2w_conv(c2, c1); - if (val){ - w16w_conv(val, &c2, &c1, &c0); - (*o_putc)(c2); - if (c1){ - (*o_putc)(c1); - if (c0) (*o_putc)(c0); - } - } + if (action_flag){ + if (result && !estab_f){ + set_iconv(TRUE, result->iconv_func); + }else if (c <= DEL){ + struct input_code *ptr = input_code_list; + while (ptr->name){ + status_reset(ptr); + ++ptr; + } + } } } -void w_oconv16(nkf_char c2, nkf_char c1) +typedef struct { + nkf_buf_t *std_gc_buf; + nkf_char broken_state; + nkf_buf_t *broken_buf; + nkf_char mimeout_state; + nkf_buf_t *nfc_buf; +} nkf_state_t; + +static nkf_state_t *nkf_state = NULL; + +#define STD_GC_BUFSIZE (256) + +static void +nkf_state_init(void) +{ + if (nkf_state) { + nkf_buf_clear(nkf_state->std_gc_buf); + nkf_buf_clear(nkf_state->broken_buf); + nkf_buf_clear(nkf_state->nfc_buf); + } + else { + nkf_state = nkf_xmalloc(sizeof(nkf_state_t)); + nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE); + nkf_state->broken_buf = nkf_buf_new(3); + nkf_state->nfc_buf = nkf_buf_new(9); + } + nkf_state->broken_state = 0; + nkf_state->mimeout_state = 0; +} + +#ifndef WIN32DLL +static nkf_char +std_getc(FILE *f) { - if (output_bom_f) { - output_bom_f = FALSE; - if (output_endian == ENDIAN_LITTLE){ - (*o_putc)((unsigned char)'\377'); - (*o_putc)('\376'); - }else{ - (*o_putc)('\376'); - (*o_putc)((unsigned char)'\377'); - } + if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){ + return nkf_buf_pop(nkf_state->std_gc_buf); } + return getc(f); +} +#endif /*WIN32DLL*/ - if (c2 == EOF) { - (*o_putc)(EOF); - return; - } +static nkf_char +std_ungetc(nkf_char c, FILE *f) +{ + nkf_buf_push(nkf_state->std_gc_buf, c); + return c; +} - if (c2 == ISO_8859_1) { - c2 = 0; - c1 |= 0x80; -#ifdef NUMCHAR_OPTION - } else if (c2 == 0 && is_unicode_capsule(c1)) { - if (is_unicode_bmp(c1)) { - c2 = (c1 >> 8) & 0xff; - c1 &= 0xff; - } else { - c1 &= VALUE_MASK; - if (c1 <= UNICODE_MAX) { - c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */ - c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ - if (output_endian == ENDIAN_LITTLE){ - (*o_putc)(c2 & 0xff); - (*o_putc)((c2 >> 8) & 0xff); - (*o_putc)(c1 & 0xff); - (*o_putc)((c1 >> 8) & 0xff); - }else{ - (*o_putc)((c2 >> 8) & 0xff); - (*o_putc)(c2 & 0xff); - (*o_putc)((c1 >> 8) & 0xff); - (*o_putc)(c1 & 0xff); - } - } - return; - } -#endif - } else if (c2) { - nkf_char val = e2w_conv(c2, c1); - c2 = (val >> 8) & 0xff; - c1 = val & 0xff; - if (!val) return; - } - if (output_endian == ENDIAN_LITTLE){ - (*o_putc)(c1); - (*o_putc)(c2); - }else{ - (*o_putc)(c2); - (*o_putc)(c1); - } +#ifndef WIN32DLL +static void +std_putc(nkf_char c) +{ + if(c!=EOF) + putchar(c); } +#endif /*WIN32DLL*/ -void w_oconv32(nkf_char c2, nkf_char c1) +static nkf_char hold_buf[HOLD_SIZE*2]; +static int hold_count = 0; +static nkf_char +push_hold_buf(nkf_char c2) { - if (output_bom_f) { - output_bom_f = FALSE; - if (output_endian == ENDIAN_LITTLE){ - (*o_putc)((unsigned char)'\377'); - (*o_putc)('\376'); - (*o_putc)('\000'); - (*o_putc)('\000'); - }else{ - (*o_putc)('\000'); - (*o_putc)('\000'); - (*o_putc)('\376'); - (*o_putc)((unsigned char)'\377'); - } - } + if (hold_count >= HOLD_SIZE*2) + return (EOF); + hold_buf[hold_count++] = c2; + return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); +} - if (c2 == EOF) { - (*o_putc)(EOF); - return; - } +static int +h_conv(FILE *f, nkf_char c1, nkf_char c2) +{ + int ret; + int hold_index; + nkf_char c3, c4; - if (c2 == ISO_8859_1) { - c1 |= 0x80; -#ifdef NUMCHAR_OPTION - } else if (c2 == 0 && is_unicode_capsule(c1)) { - c1 &= VALUE_MASK; -#endif - } else if (c2) { - c1 = e2w_conv(c2, c1); - if (!c1) return; + /** it must NOT be in the kanji shifte sequence */ + /** it must NOT be written in JIS7 */ + /** and it must be after 2 byte 8bit code */ + + hold_count = 0; + push_hold_buf(c1); + push_hold_buf(c2); + + while ((c2 = (*i_getc)(f)) != EOF) { + if (c2 == ESC){ + (*i_ungetc)(c2,f); + break; + } + code_status(c2); + if (push_hold_buf(c2) == EOF || estab_f) { + break; + } } - if (output_endian == ENDIAN_LITTLE){ - (*o_putc)( c1 & NKF_INT32_C(0x000000FF)); - (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8); - (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16); - (*o_putc)('\000'); - }else{ - (*o_putc)('\000'); - (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16); - (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8); - (*o_putc)( c1 & NKF_INT32_C(0x000000FF)); + + if (!estab_f) { + struct input_code *p = input_code_list; + struct input_code *result = p; + if (c2 == EOF) { + code_status(c2); + } + while (p->name) { + if (p->status_func && p->score < result->score) { + result = p; + } + p++; + } + set_iconv(TRUE, result->iconv_func); } -} -#endif -void e_oconv(nkf_char c2, nkf_char c1) + + /** now, + ** 1) EOF is detected, or + ** 2) Code is established, or + ** 3) Buffer is FULL (but last word is pushed) + ** + ** in 1) and 3) cases, we continue to use + ** Kanji codes by oconv and leave estab_f unchanged. + **/ + + ret = c2; + hold_index = 0; + while (hold_index < hold_count){ + c1 = hold_buf[hold_index++]; + if (nkf_char_unicode_p(c1)) { + (*oconv)(0, c1); + continue; + } + else if (c1 <= DEL){ + (*iconv)(0, c1, 0); + continue; + }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){ + (*iconv)(JIS_X_0201_1976_K, c1, 0); + continue; + } + if (hold_index < hold_count){ + c2 = hold_buf[hold_index++]; + }else{ + c2 = (*i_getc)(f); + if (c2 == EOF){ + c4 = EOF; + break; + } + code_status(c2); + } + c3 = 0; + switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */ + case -2: + /* 4 bytes UTF-8 */ + if (hold_index < hold_count){ + c3 = hold_buf[hold_index++]; + } else if ((c3 = (*i_getc)(f)) == EOF) { + ret = EOF; + break; + } + code_status(c3); + if (hold_index < hold_count){ + c4 = hold_buf[hold_index++]; + } else if ((c4 = (*i_getc)(f)) == EOF) { + c3 = ret = EOF; + break; + } + code_status(c4); + (*iconv)(c1, c2, (c3<<8)|c4); + break; + case -1: + /* 3 bytes EUC or UTF-8 */ + if (hold_index < hold_count){ + c3 = hold_buf[hold_index++]; + } else if ((c3 = (*i_getc)(f)) == EOF) { + ret = EOF; + break; + } else { + code_status(c3); + } + (*iconv)(c1, c2, c3); + break; + } + if (c3 == EOF) break; + } + return ret; +} + +/* + * Check and Ignore BOM + */ +static void +check_bom(FILE *f) { -#ifdef NUMCHAR_OPTION - if (c2 == 0 && is_unicode_capsule(c1)){ - w16e_conv(c1, &c2, &c1); - if (c2 == 0 && is_unicode_capsule(c1)){ - c2 = c1 & VALUE_MASK; - if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) { - /* eucJP-ms UDC */ - c1 &= 0xFFF; - c2 = c1 / 94; - c2 += c2 < 10 ? 0x75 : 0x8FEB; - c1 = 0x21 + c1 % 94; - if (is_eucg3(c2)){ - (*o_putc)(0x8f); - (*o_putc)((c2 & 0x7f) | 0x080); - (*o_putc)(c1 | 0x080); - }else{ - (*o_putc)((c2 & 0x7f) | 0x080); - (*o_putc)(c1 | 0x080); + int c2; + switch(c2 = (*i_getc)(f)){ + case 0x00: + if((c2 = (*i_getc)(f)) == 0x00){ + if((c2 = (*i_getc)(f)) == 0xFE){ + if((c2 = (*i_getc)(f)) == 0xFF){ + if(!input_encoding){ + set_iconv(TRUE, w_iconv32); + } + if (iconv == w_iconv32) { + input_endian = ENDIAN_BIG; + return; + } + (*i_ungetc)(0xFF,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0xFE,f); + }else if(c2 == 0xFF){ + if((c2 = (*i_getc)(f)) == 0xFE){ + if(!input_encoding){ + set_iconv(TRUE, w_iconv32); + } + if (iconv == w_iconv32) { + input_endian = ENDIAN_2143; + return; + } + (*i_ungetc)(0xFF,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0xFF,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0x00,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0x00,f); + break; + case 0xEF: + if((c2 = (*i_getc)(f)) == 0xBB){ + if((c2 = (*i_getc)(f)) == 0xBF){ + if(!input_encoding){ + set_iconv(TRUE, w_iconv); + } + if (iconv == w_iconv) { + return; } + (*i_ungetc)(0xBF,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0xBB,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0xEF,f); + break; + case 0xFE: + if((c2 = (*i_getc)(f)) == 0xFF){ + if((c2 = (*i_getc)(f)) == 0x00){ + if((c2 = (*i_getc)(f)) == 0x00){ + if(!input_encoding){ + set_iconv(TRUE, w_iconv32); + } + if (iconv == w_iconv32) { + input_endian = ENDIAN_3412; + return; + } + (*i_ungetc)(0x00,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0x00,f); + }else (*i_ungetc)(c2,f); + if(!input_encoding){ + set_iconv(TRUE, w_iconv16); + } + if (iconv == w_iconv16) { + input_endian = ENDIAN_BIG; return; - } else { - if (encode_fallback) (*encode_fallback)(c1); + } + (*i_ungetc)(0xFF,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0xFE,f); + break; + case 0xFF: + if((c2 = (*i_getc)(f)) == 0xFE){ + if((c2 = (*i_getc)(f)) == 0x00){ + if((c2 = (*i_getc)(f)) == 0x00){ + if(!input_encoding){ + set_iconv(TRUE, w_iconv32); + } + if (iconv == w_iconv32) { + input_endian = ENDIAN_LITTLE; + return; + } + (*i_ungetc)(0x00,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0x00,f); + }else (*i_ungetc)(c2,f); + if(!input_encoding){ + set_iconv(TRUE, w_iconv16); + } + if (iconv == w_iconv16) { + input_endian = ENDIAN_LITTLE; return; } - } - } -#endif - if (c2 == EOF) { - (*o_putc)(EOF); - return; - } else if (c2 == 0) { - output_mode = ASCII; - (*o_putc)(c1); - } else if (c2 == JIS_X_0201) { - output_mode = EUC_JP; - (*o_putc)(SSO); (*o_putc)(c1|0x80); - } else if (c2 == ISO_8859_1) { - output_mode = ISO_8859_1; - (*o_putc)(c1 | 0x080); -#ifdef X0212_ENABLE - } else if (is_eucg3(c2)){ - output_mode = EUC_JP; -#ifdef SHIFTJIS_CP932 - if (!cp932inv_f){ - nkf_char s2, s1; - if (e2s_conv(c2, c1, &s2, &s1) == 0){ - s2e_conv(s2, s1, &c2, &c1); - } - } -#endif - if (c2 == 0) { - output_mode = ASCII; - (*o_putc)(c1); - }else if (is_eucg3(c2)){ - if (x0212_f){ - (*o_putc)(0x8f); - (*o_putc)((c2 & 0x7f) | 0x080); - (*o_putc)(c1 | 0x080); - } - }else{ - (*o_putc)((c2 & 0x7f) | 0x080); - (*o_putc)(c1 | 0x080); - } -#endif - } else { - if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) { - set_iconv(FALSE, 0); - return; /* too late to rescue this char */ - } - output_mode = EUC_JP; - (*o_putc)(c2 | 0x080); - (*o_putc)(c1 | 0x080); + (*i_ungetc)(0xFE,f); + }else (*i_ungetc)(c2,f); + (*i_ungetc)(0xFF,f); + break; + default: + (*i_ungetc)(c2,f); + break; } } -#ifdef X0212_ENABLE -nkf_char x0212_shift(nkf_char c) +static nkf_char +broken_getc(FILE *f) { - nkf_char ret = c; - c &= 0x7f; - if (is_eucg3(ret)){ - if (0x75 <= c && c <= 0x7f){ - ret = c + (0x109 - 0x75); - } - }else{ - if (0x75 <= c && c <= 0x7f){ - ret = c + (0x113 - 0x75); - } + nkf_char c, c1; + + if (!nkf_buf_empty_p(nkf_state->broken_buf)) { + return nkf_buf_pop(nkf_state->broken_buf); + } + c = (*i_bgetc)(f); + if (c=='$' && nkf_state->broken_state != ESC + && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) { + c1= (*i_bgetc)(f); + nkf_state->broken_state = 0; + if (c1=='@'|| c1=='B') { + nkf_buf_push(nkf_state->broken_buf, c1); + nkf_buf_push(nkf_state->broken_buf, c); + return ESC; + } else { + (*i_bungetc)(c1,f); + return c; + } + } else if (c=='(' && nkf_state->broken_state != ESC + && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) { + c1= (*i_bgetc)(f); + nkf_state->broken_state = 0; + if (c1=='J'|| c1=='B') { + nkf_buf_push(nkf_state->broken_buf, c1); + nkf_buf_push(nkf_state->broken_buf, c); + return ESC; + } else { + (*i_bungetc)(c1,f); + return c; + } + } else { + nkf_state->broken_state = c; + return c; } - return ret; } +static nkf_char +broken_ungetc(nkf_char c, FILE *f) +{ + if (nkf_buf_length(nkf_state->broken_buf) < 2) + nkf_buf_push(nkf_state->broken_buf, c); + return c; +} -nkf_char x0212_unshift(nkf_char c) +static void +eol_conv(nkf_char c2, nkf_char c1) { - nkf_char ret = c; - if (0x7f <= c && c <= 0x88){ - ret = c + (0x75 - 0x7f); - }else if (0x89 <= c && c <= 0x92){ - ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89)); + if (guess_f && input_eol != EOF) { + if (c2 == 0 && c1 == LF) { + if (!input_eol) input_eol = prev_cr ? CRLF : LF; + else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF; + } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF; + else if (!prev_cr); + else if (!input_eol) input_eol = CR; + else if (input_eol != CR) input_eol = EOF; } - return ret; + if (prev_cr || (c2 == 0 && c1 == LF)) { + prev_cr = 0; + if (eolmode_f != LF) (*o_eol_conv)(0, CR); + if (eolmode_f != CR) (*o_eol_conv)(0, LF); + } + if (c2 == 0 && c1 == CR) prev_cr = CR; + else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1); } -#endif /* X0212_ENABLE */ -nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1) +static void +put_newline(void (*func)(nkf_char)) { - nkf_char ndx; - if (is_eucg3(c2)){ - ndx = c2 & 0x7f; - if (x0213_f){ - if((0x21 <= ndx && ndx <= 0x2F)){ - if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3; - if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); - return 0; - }else if(0x6E <= ndx && ndx <= 0x7E){ - if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe; - if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); - return 0; - } - return 1; - } -#ifdef X0212_ENABLE - else if(nkf_isgraph(ndx)){ - nkf_char val = 0; - const unsigned short *ptr; - ptr = x0212_shiftjis[ndx - 0x21]; - if (ptr){ - val = ptr[(c1 & 0x7f) - 0x21]; - } - if (val){ - c2 = val >> 8; - c1 = val & 0xff; - if (p2) *p2 = c2; - if (p1) *p1 = c1; - return 0; - } - c2 = x0212_shift(c2); - } -#endif /* X0212_ENABLE */ + switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { + case CRLF: + (*func)(0x0D); + (*func)(0x0A); + break; + case CR: + (*func)(0x0D); + break; + case LF: + (*func)(0x0A); + break; } - if(0x7F < c2) return 1; - if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1); - if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e); - return 0; } -void s_oconv(nkf_char c2, nkf_char c1) +static void +oconv_newline(void (*func)(nkf_char, nkf_char)) { -#ifdef NUMCHAR_OPTION - if (c2 == 0 && is_unicode_capsule(c1)){ - w16e_conv(c1, &c2, &c1); - if (c2 == 0 && is_unicode_capsule(c1)){ - c2 = c1 & VALUE_MASK; - if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) { - /* CP932 UDC */ - c1 &= 0xFFF; - c2 = c1 / 188 + 0xF0; - c1 = c1 % 188; - c1 += 0x40 + (c1 > 0x3e); - (*o_putc)(c2); - (*o_putc)(c1); - return; - } else { - if(encode_fallback)(*encode_fallback)(c1); - return; - } - } - } -#endif - if (c2 == EOF) { - (*o_putc)(EOF); - return; - } else if (c2 == 0) { - output_mode = ASCII; - (*o_putc)(c1); - } else if (c2 == JIS_X_0201) { - output_mode = SHIFT_JIS; - (*o_putc)(c1|0x80); - } else if (c2 == ISO_8859_1) { - output_mode = ISO_8859_1; - (*o_putc)(c1 | 0x080); -#ifdef X0212_ENABLE - } else if (is_eucg3(c2)){ - output_mode = SHIFT_JIS; - if (e2s_conv(c2, c1, &c2, &c1) == 0){ - (*o_putc)(c2); - (*o_putc)(c1); - } -#endif - } else { - if (!nkf_isprint(c1) || !nkf_isprint(c2)) { - set_iconv(FALSE, 0); - return; /* too late to rescue this char */ - } - output_mode = SHIFT_JIS; - e2s_conv(c2, c1, &c2, &c1); - -#ifdef SHIFTJIS_CP932 - if (cp932inv_f - && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){ - nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40]; - if (c){ - c2 = c >> 8; - c1 = c & 0xff; - } - } -#endif /* SHIFTJIS_CP932 */ - - (*o_putc)(c2); - if (prefix_table[(unsigned char)c1]){ - (*o_putc)(prefix_table[(unsigned char)c1]); - } - (*o_putc)(c1); - } -} - -void j_oconv(nkf_char c2, nkf_char c1) -{ -#ifdef NUMCHAR_OPTION - if (c2 == 0 && is_unicode_capsule(c1)){ - w16e_conv(c1, &c2, &c1); - if (c2 == 0 && is_unicode_capsule(c1)){ - c2 = c1 & VALUE_MASK; - if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) { - /* CP5022x UDC */ - c1 &= 0xFFF; - c2 = 0x7F + c1 / 94; - c1 = 0x21 + c1 % 94; - } else { - if (encode_fallback) (*encode_fallback)(c1); - return; - } - } - } -#endif - if (c2 == EOF) { - if (output_mode !=ASCII && output_mode!=ISO_8859_1) { - (*o_putc)(ESC); - (*o_putc)('('); - (*o_putc)(ascii_intro); - output_mode = ASCII; - } - (*o_putc)(EOF); -#ifdef X0212_ENABLE - } else if (is_eucg3(c2)){ - if(x0213_f){ - if(output_mode!=JIS_X_0213_2){ - output_mode = JIS_X_0213_2; - (*o_putc)(ESC); - (*o_putc)('$'); - (*o_putc)('('); - (*o_putc)(0x50); - } - }else{ - if(output_mode!=JIS_X_0212){ - output_mode = JIS_X_0212; - (*o_putc)(ESC); - (*o_putc)('$'); - (*o_putc)('('); - (*o_putc)(0x44); - } - } - (*o_putc)(c2 & 0x7f); - (*o_putc)(c1); -#endif - } else if (c2==JIS_X_0201) { - if (output_mode!=JIS_X_0201) { - output_mode = JIS_X_0201; - (*o_putc)(ESC); - (*o_putc)('('); - (*o_putc)('I'); - } - (*o_putc)(c1); - } else if (c2==ISO_8859_1) { - /* iso8859 introduction, or 8th bit on */ - /* Can we convert in 7bit form using ESC-'-'-A ? - Is this popular? */ - output_mode = ISO_8859_1; - (*o_putc)(c1|0x80); - } else if (c2 == 0) { - if (output_mode !=ASCII && output_mode!=ISO_8859_1) { - (*o_putc)(ESC); - (*o_putc)('('); - (*o_putc)(ascii_intro); - output_mode = ASCII; - } - (*o_putc)(c1); - } else { - if(ms_ucs_map_f - ? c2<0x20 || 0x920) { - return broken_buf[--broken_counter]; - } - c= (*i_bgetc)(f); - if (c=='$' && broken_last != ESC - && (input_mode==ASCII || input_mode==JIS_X_0201)) { - c1= (*i_bgetc)(f); - broken_last = 0; - if (c1=='@'|| c1=='B') { - broken_buf[0]=c1; broken_buf[1]=c; - broken_counter=2; - return ESC; - } else { - (*i_bungetc)(c1,f); - return c; - } - } else if (c=='(' && broken_last != ESC - && (input_mode==JIS_X_0208 || input_mode==JIS_X_0201)) { /* ) */ - c1= (*i_bgetc)(f); - broken_last = 0; - if (c1=='J'|| c1=='B') { - broken_buf[0]=c1; broken_buf[1]=c; - broken_counter=2; - return ESC; - } else { - (*i_bungetc)(c1,f); - return c; - } - } else { - broken_last = c; - return c; - } -} - -nkf_char broken_ungetc(nkf_char c, FILE *f) -{ - if (broken_counter<2) - broken_buf[broken_counter++]=c; - return c; -} - -void nl_conv(nkf_char c2, nkf_char c1) -{ - if (guess_f && input_newline != EOF) { - if (c2 == 0 && c1 == LF) { - if (!input_newline) input_newline = prev_cr ? CRLF : LF; - else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF; - } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF; - else if (!prev_cr); - else if (!input_newline) input_newline = CR; - else if (input_newline != CR) input_newline = EOF; - } - if (prev_cr || (c2 == 0 && c1 == LF)) { - prev_cr = 0; - if (nlmode_f != LF) (*o_nlconv)(0, CR); - if (nlmode_f != CR) (*o_nlconv)(0, LF); + switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) { + case CRLF: + (*func)(0, 0x0D); + (*func)(0, 0x0A); + break; + case CR: + (*func)(0, 0x0D); + break; + case LF: + (*func)(0, 0x0A); + break; } - if (c2 == 0 && c1 == CR) prev_cr = CR; - else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1); } /* - Return value of fold_conv() + Return value of fold_conv() - LF add newline and output char - CR add newline and output nothing - SP space - 0 skip - 1 (or else) normal output + LF add newline and output char + CR add newline and output nothing + SP space + 0 skip + 1 (or else) normal output - fold state in prev (previous character) + fold state in prev (previous character) - >0x80 Japanese (X0208/X0201) - <0x80 ASCII - LF new line - SP space + >0x80 Japanese (X0208/X0201) + <0x80 ASCII + LF new line + SP space - This fold algorthm does not preserve heading space in a line. - This is the main difference from fmt. -*/ + This fold algorthm does not preserve heading space in a line. + This is the main difference from fmt. + */ #define char_size(c2,c1) (c2?2:1) -void fold_conv(nkf_char c2, nkf_char c1) +static void +fold_conv(nkf_char c2, nkf_char c1) { nkf_char prev0; nkf_char fold_state; if (c1== CR && !fold_preserve_f) { - fold_state=0; /* ignore cr */ + fold_state=0; /* ignore cr */ }else if (c1== LF&&f_prev==CR && fold_preserve_f) { - f_prev = LF; - fold_state=0; /* ignore cr */ + f_prev = LF; + fold_state=0; /* ignore cr */ } else if (c1== BS) { - if (f_line>0) f_line--; - fold_state = 1; + if (f_line>0) f_line--; + fold_state = 1; } else if (c2==EOF && f_line != 0) { /* close open last line */ - fold_state = LF; + fold_state = LF; } else if ((c1==LF && !fold_preserve_f) - || ((c1==CR||(c1==LF&&f_prev!=CR)) - && fold_preserve_f)) { - /* new line */ - if (fold_preserve_f) { - f_prev = c1; - f_line = 0; - fold_state = CR; + || ((c1==CR||(c1==LF&&f_prev!=CR)) + && fold_preserve_f)) { + /* new line */ + if (fold_preserve_f) { + f_prev = c1; + f_line = 0; + fold_state = CR; } else if ((f_prev == c1 && !fold_preserve_f) - || (f_prev == LF && fold_preserve_f) - ) { /* duplicate newline */ - if (f_line) { - f_line = 0; - fold_state = LF; /* output two newline */ - } else { - f_line = 0; - fold_state = 1; - } - } else { - if (f_prev&0x80) { /* Japanese? */ - f_prev = c1; - fold_state = 0; /* ignore given single newline */ - } else if (f_prev==SP) { - fold_state = 0; - } else { - f_prev = c1; - if (++f_line<=fold_len) - fold_state = SP; - else { - f_line = 0; - fold_state = CR; /* fold and output nothing */ - } - } - } + || (f_prev == LF && fold_preserve_f) + ) { /* duplicate newline */ + if (f_line) { + f_line = 0; + fold_state = LF; /* output two newline */ + } else { + f_line = 0; + fold_state = 1; + } + } else { + if (f_prev&0x80) { /* Japanese? */ + f_prev = c1; + fold_state = 0; /* ignore given single newline */ + } else if (f_prev==SP) { + fold_state = 0; + } else { + f_prev = c1; + if (++f_line<=fold_len) + fold_state = SP; + else { + f_line = 0; + fold_state = CR; /* fold and output nothing */ + } + } + } } else if (c1=='\f') { - f_prev = LF; - f_line = 0; - fold_state = LF; /* output newline and clear */ - } else if ( (c2==0 && c1==SP)|| - (c2==0 && c1==TAB)|| - (c2=='!'&& c1=='!')) { - /* X0208 kankaku or ascii space */ - if (f_prev == SP) { - fold_state = 0; /* remove duplicate spaces */ - } else { - f_prev = SP; - if (++f_line<=fold_len) - fold_state = SP; /* output ASCII space only */ - else { - f_prev = SP; f_line = 0; - fold_state = CR; /* fold and output nothing */ - } - } + f_prev = LF; + f_line = 0; + fold_state = LF; /* output newline and clear */ + } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) { + /* X0208 kankaku or ascii space */ + if (f_prev == SP) { + fold_state = 0; /* remove duplicate spaces */ + } else { + f_prev = SP; + if (++f_line<=fold_len) + fold_state = SP; /* output ASCII space only */ + else { + f_prev = SP; f_line = 0; + fold_state = CR; /* fold and output nothing */ + } + } } else { - prev0 = f_prev; /* we still need this one... , but almost done */ - f_prev = c1; - if (c2 || c2==JIS_X_0201) - f_prev |= 0x80; /* this is Japanese */ - f_line += char_size(c2,c1); - if (f_line<=fold_len) { /* normal case */ - fold_state = 1; - } else { - if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */ - f_line = char_size(c2,c1); - fold_state = LF; /* We can't wait, do fold now */ - } else if (c2==JIS_X_0201) { - /* simple kinsoku rules return 1 means no folding */ - if (c1==(0xde&0x7f)) fold_state = 1; /* ゛*/ - else if (c1==(0xdf&0x7f)) fold_state = 1; /* ゜*/ - else if (c1==(0xa4&0x7f)) fold_state = 1; /* 。*/ - else if (c1==(0xa3&0x7f)) fold_state = 1; /* ,*/ - else if (c1==(0xa1&0x7f)) fold_state = 1; /* 」*/ - else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */ - else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */ + prev0 = f_prev; /* we still need this one... , but almost done */ + f_prev = c1; + if (c2 || c2 == JIS_X_0201_1976_K) + f_prev |= 0x80; /* this is Japanese */ + f_line += char_size(c2,c1); + if (f_line<=fold_len) { /* normal case */ + fold_state = 1; + } else { + if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */ + f_line = char_size(c2,c1); + fold_state = LF; /* We can't wait, do fold now */ + } else if (c2 == JIS_X_0201_1976_K) { + /* simple kinsoku rules return 1 means no folding */ + if (c1==(0xde&0x7f)) fold_state = 1; /* ゛*/ + else if (c1==(0xdf&0x7f)) fold_state = 1; /* ゜*/ + else if (c1==(0xa4&0x7f)) fold_state = 1; /* 。*/ + else if (c1==(0xa3&0x7f)) fold_state = 1; /* ,*/ + else if (c1==(0xa1&0x7f)) fold_state = 1; /* 」*/ + else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */ + else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */ f_line = 1; fold_state = LF;/* add one new f_line before this character */ } else { f_line = 1; fold_state = LF;/* add one new f_line before this character */ } - } else if (c2==0) { - /* kinsoku point in ASCII */ + } else if (c2==0) { + /* kinsoku point in ASCII */ if ( c1==')'|| /* { [ ( */ - c1==']'|| - c1=='}'|| - c1=='.'|| - c1==','|| - c1=='!'|| - c1=='?'|| - c1=='/'|| - c1==':'|| - c1==';') { + c1==']'|| + c1=='}'|| + c1=='.'|| + c1==','|| + c1=='!'|| + c1=='?'|| + c1=='/'|| + c1==':'|| + c1==';') { fold_state = 1; - /* just after special */ + /* just after special */ } else if (!is_alnum(prev0)) { f_line = char_size(c2,c1); fold_state = LF; } else if ((prev0==SP) || /* ignored new f_line */ - (prev0==LF)|| /* ignored new f_line */ - (prev0&0x80)) { /* X0208 - ASCII */ + (prev0==LF)|| /* ignored new f_line */ + (prev0&0x80)) { /* X0208 - ASCII */ f_line = char_size(c2,c1); - fold_state = LF;/* add one new f_line before this character */ - } else { - fold_state = 1; /* default no fold in ASCII */ - } - } else { - if (c2=='!') { - if (c1=='"') fold_state = 1; /* 、 */ - else if (c1=='#') fold_state = 1; /* 。 */ - else if (c1=='W') fold_state = 1; /* 」 */ - else if (c1=='K') fold_state = 1; /* ) */ - else if (c1=='$') fold_state = 1; /* , */ - else if (c1=='%') fold_state = 1; /* . */ - else if (c1=='\'') fold_state = 1; /* + */ - else if (c1=='(') fold_state = 1; /* ; */ - else if (c1==')') fold_state = 1; /* ? */ - else if (c1=='*') fold_state = 1; /* ! */ - else if (c1=='+') fold_state = 1; /* ゛ */ - else if (c1==',') fold_state = 1; /* ゜ */ - /* default no fold in kinsoku */ + fold_state = LF;/* add one new f_line before this character */ + } else { + fold_state = 1; /* default no fold in ASCII */ + } + } else { + if (c2=='!') { + if (c1=='"') fold_state = 1; /* 、 */ + else if (c1=='#') fold_state = 1; /* 。 */ + else if (c1=='W') fold_state = 1; /* 」 */ + else if (c1=='K') fold_state = 1; /* ) */ + else if (c1=='$') fold_state = 1; /* , */ + else if (c1=='%') fold_state = 1; /* . */ + else if (c1=='\'') fold_state = 1; /* + */ + else if (c1=='(') fold_state = 1; /* ; */ + else if (c1==')') fold_state = 1; /* ? */ + else if (c1=='*') fold_state = 1; /* ! */ + else if (c1=='+') fold_state = 1; /* ゛ */ + else if (c1==',') fold_state = 1; /* ゜ */ + /* default no fold in kinsoku */ else { fold_state = LF; f_line = char_size(c2,c1); /* add one new f_line before this character */ } - } else { + } else { f_line = char_size(c2,c1); - fold_state = LF; - /* add one new f_line before this character */ - } - } - } + fold_state = LF; + /* add one new f_line before this character */ + } + } + } } /* terminator process */ switch(fold_state) { - case LF: - OCONV_NEWLINE((*o_fconv)); - (*o_fconv)(c2,c1); - break; - case 0: - return; - case CR: - OCONV_NEWLINE((*o_fconv)); - break; - case TAB: - case SP: - (*o_fconv)(0,SP); - break; - default: - (*o_fconv)(c2,c1); + case LF: + oconv_newline(o_fconv); + (*o_fconv)(c2,c1); + break; + case 0: + return; + case CR: + oconv_newline(o_fconv); + break; + case TAB: + case SP: + (*o_fconv)(0,SP); + break; + default: + (*o_fconv)(c2,c1); } } -nkf_char z_prev2=0,z_prev1=0; +static nkf_char z_prev2=0,z_prev1=0; -void z_conv(nkf_char c2, nkf_char c1) +static void +z_conv(nkf_char c2, nkf_char c1) { /* if (c2) c1 &= 0x7f; assertion */ - if (c2 == JIS_X_0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) { + if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) { (*o_zconv)(c2,c1); return; } if (x0201_f) { - if (z_prev2 == JIS_X_0201) { - if (c2 == JIS_X_0201) { + if (z_prev2 == JIS_X_0201_1976_K) { + if (c2 == JIS_X_0201_1976_K) { if (c1 == (0xde&0x7f)) { /* 濁点 */ z_prev2 = 0; (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]); @@ -4786,7 +3618,7 @@ void z_conv(nkf_char c2, nkf_char c1) z_prev2 = 0; (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]); } - if (c2 == JIS_X_0201) { + if (c2 == JIS_X_0201_1976_K) { if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) { /* wait for 濁点 or 半濁点 */ z_prev1 = c1; @@ -4800,33 +3632,33 @@ void z_conv(nkf_char c2, nkf_char c1) } if (c2 == EOF) { - (*o_zconv)(c2, c1); - return; + (*o_zconv)(c2, c1); + return; } if (alpha_f&1 && c2 == 0x23) { /* JISX0208 Alphabet */ - c2 = 0; + c2 = 0; } else if (c2 == 0x21) { /* JISX0208 Kigou */ - if (0x21==c1) { - if (alpha_f&2) { - c2 = 0; - c1 = SP; - } else if (alpha_f&4) { - (*o_zconv)(0, SP); - (*o_zconv)(0, SP); - return; - } - } else if (alpha_f&1 && 0x20': entity = ">"; break; case '<': entity = "<"; break; @@ -4842,7 +3674,7 @@ void z_conv(nkf_char c2, nkf_char c1) if (alpha_f & 16) { /* JIS X 0208 Katakana to JIS X 0201 Katakana */ if (c2 == 0x21) { - char c = 0; + nkf_char c = 0; switch (c1) { case 0x23: /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */ @@ -4878,7 +3710,7 @@ void z_conv(nkf_char c2, nkf_char c1) break; } if (c) { - (*o_zconv)(JIS_X_0201, c); + (*o_zconv)(JIS_X_0201_1976_K, c); return; } } else if (c2 == 0x25) { @@ -4900,9 +3732,9 @@ void z_conv(nkf_char c2, nkf_char c1) }; if (fullwidth_to_halfwidth[c1-0x20]){ c2 = fullwidth_to_halfwidth[c1-0x20]; - (*o_zconv)(JIS_X_0201, c2>>8); + (*o_zconv)(JIS_X_0201_1976_K, c2>>8); if (c2 & 0xFF) { - (*o_zconv)(JIS_X_0201, c2&0xFF); + (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF); } return; } @@ -4913,25 +3745,26 @@ void z_conv(nkf_char c2, nkf_char c1) #define rot13(c) ( \ - ( c < 'A') ? c: \ - (c <= 'M') ? (c + 13): \ - (c <= 'Z') ? (c - 13): \ - (c < 'a') ? (c): \ - (c <= 'm') ? (c + 13): \ - (c <= 'z') ? (c - 13): \ - (c) \ -) + ( c < 'A') ? c: \ + (c <= 'M') ? (c + 13): \ + (c <= 'Z') ? (c - 13): \ + (c < 'a') ? (c): \ + (c <= 'm') ? (c + 13): \ + (c <= 'z') ? (c - 13): \ + (c) \ + ) #define rot47(c) ( \ - ( c < '!') ? c: \ - ( c <= 'O') ? (c + 47) : \ - ( c <= '~') ? (c - 47) : \ - c \ -) - -void rot_conv(nkf_char c2, nkf_char c1) + ( c < '!') ? c: \ + ( c <= 'O') ? (c + 47) : \ + ( c <= '~') ? (c - 47) : \ + c \ + ) + +static void +rot_conv(nkf_char c2, nkf_char c1) { - if (c2==0 || c2==JIS_X_0201 || c2==ISO_8859_1) { + if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) { c1 = rot13(c1); } else if (c2) { c1 = rot47(c1); @@ -4940,61 +3773,64 @@ void rot_conv(nkf_char c2, nkf_char c1) (*o_rot_conv)(c2,c1); } -void hira_conv(nkf_char c2, nkf_char c1) +static void +hira_conv(nkf_char c2, nkf_char c1) { if (hira_f & 1) { - if (c2 == 0x25) { - if (0x20 < c1 && c1 < 0x74) { - c2 = 0x24; - (*o_hira_conv)(c2,c1); - return; - } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) { - c2 = 0; - c1 = CLASS_UNICODE | 0x3094; - (*o_hira_conv)(c2,c1); - return; - } - } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) { - c1 += 2; - (*o_hira_conv)(c2,c1); - return; - } + if (c2 == 0x25) { + if (0x20 < c1 && c1 < 0x74) { + c2 = 0x24; + (*o_hira_conv)(c2,c1); + return; + } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) { + c2 = 0; + c1 = nkf_char_unicode_new(0x3094); + (*o_hira_conv)(c2,c1); + return; + } + } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) { + c1 += 2; + (*o_hira_conv)(c2,c1); + return; + } } if (hira_f & 2) { - if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) { - c2 = 0x25; - c1 = 0x74; - } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) { - c2 = 0x25; - } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) { - c1 -= 2; - } + if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) { + c2 = 0x25; + c1 = 0x74; + } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) { + c2 = 0x25; + } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) { + c1 -= 2; + } } (*o_hira_conv)(c2,c1); } -void iso2022jp_check_conv(nkf_char c2, nkf_char c1) +static void +iso2022jp_check_conv(nkf_char c2, nkf_char c1) { +#define RANGE_NUM_MAX 18 static const nkf_char range[RANGE_NUM_MAX][2] = { - {0x222f, 0x2239,}, - {0x2242, 0x2249,}, - {0x2251, 0x225b,}, - {0x226b, 0x2271,}, - {0x227a, 0x227d,}, - {0x2321, 0x232f,}, - {0x233a, 0x2340,}, - {0x235b, 0x2360,}, - {0x237b, 0x237e,}, - {0x2474, 0x247e,}, - {0x2577, 0x257e,}, - {0x2639, 0x2640,}, - {0x2659, 0x267e,}, - {0x2742, 0x2750,}, - {0x2772, 0x277e,}, - {0x2841, 0x287e,}, - {0x4f54, 0x4f7e,}, - {0x7425, 0x747e}, + {0x222f, 0x2239,}, + {0x2242, 0x2249,}, + {0x2251, 0x225b,}, + {0x226b, 0x2271,}, + {0x227a, 0x227d,}, + {0x2321, 0x232f,}, + {0x233a, 0x2340,}, + {0x235b, 0x2360,}, + {0x237b, 0x237e,}, + {0x2474, 0x247e,}, + {0x2577, 0x257e,}, + {0x2639, 0x2640,}, + {0x2659, 0x267e,}, + {0x2742, 0x2750,}, + {0x2772, 0x277e,}, + {0x2841, 0x287e,}, + {0x4f54, 0x4f7e,}, + {0x7425, 0x747e}, }; nkf_char i; nkf_char start, end, c; @@ -5029,6 +3865,7 @@ static const unsigned char *mime_pattern[] = { (const unsigned char *)"\075?ISO-8859-1?Q?", (const unsigned char *)"\075?ISO-8859-1?B?", (const unsigned char *)"\075?ISO-2022-JP?B?", + (const unsigned char *)"\075?ISO-2022-JP?B?", (const unsigned char *)"\075?ISO-2022-JP?Q?", #if defined(UTF8_INPUT_ENABLE) (const unsigned char *)"\075?UTF-8?B?", @@ -5049,7 +3886,7 @@ nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = { }; static const nkf_char mime_encode[] = { - EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201, + EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K, #if defined(UTF8_INPUT_ENABLE) UTF_8, UTF_8, #endif @@ -5058,7 +3895,7 @@ static const nkf_char mime_encode[] = { }; static const nkf_char mime_encode_method[] = { - 'B', 'B','Q', 'B', 'B', 'Q', + 'B', 'B','Q', 'B', 'B', 'B', 'Q', #if defined(UTF8_INPUT_ENABLE) 'B', 'Q', #endif @@ -5067,9 +3904,55 @@ static const nkf_char mime_encode_method[] = { }; +/* MIME preprocessor fifo */ + +#define MIME_BUF_SIZE (1024) /* 2^n ring buffer */ +#define MIME_BUF_MASK (MIME_BUF_SIZE-1) +#define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK] +static struct { + unsigned char buf[MIME_BUF_SIZE]; + unsigned int top; + unsigned int last; /* decoded */ + unsigned int input; /* undecoded */ +} mime_input_state; +static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL; + #define MAXRECOVER 20 -void switch_mime_getc(void) +static void +mime_input_buf_unshift(nkf_char c) +{ + mime_input_buf(--mime_input_state.top) = (unsigned char)c; +} + +static nkf_char +mime_ungetc(nkf_char c, FILE *f) +{ + mime_input_buf_unshift(c); + return c; +} + +static nkf_char +mime_ungetc_buf(nkf_char c, FILE *f) +{ + if (mimebuf_f) + (*i_mungetc_buf)(c,f); + else + mime_input_buf(--mime_input_state.input) = (unsigned char)c; + return c; +} + +static nkf_char +mime_getc_buf(FILE *f) +{ + /* we don't keep eof of mime_input_buf, becase it contains ?= as + a terminator. It was checked in mime_integrity. */ + return ((mimebuf_f)? + (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++)); +} + +static void +switch_mime_getc(void) { if (i_getc!=mime_getc) { i_mgetc = i_getc; i_getc = mime_getc; @@ -5081,7 +3964,8 @@ void switch_mime_getc(void) } } -void unswitch_mime_getc(void) +static void +unswitch_mime_getc(void) { if(mime_f==STRICT_MIME) { i_mgetc = i_mgetc_buf; @@ -5093,7 +3977,47 @@ void unswitch_mime_getc(void) mime_iconv_back = NULL; } -nkf_char mime_begin_strict(FILE *f) +static nkf_char +mime_integrity(FILE *f, const unsigned char *p) +{ + nkf_char c,d; + unsigned int q; + /* In buffered mode, read until =? or NL or buffer full + */ + mime_input_state.input = mime_input_state.top; + mime_input_state.last = mime_input_state.top; + + while(*p) mime_input_buf(mime_input_state.input++) = *p++; + d = 0; + q = mime_input_state.input; + while((c=(*i_getc)(f))!=EOF) { + if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) { + break; /* buffer full */ + } + if (c=='=' && d=='?') { + /* checked. skip header, start decode */ + mime_input_buf(mime_input_state.input++) = (unsigned char)c; + /* mime_last_input = mime_input_state.input; */ + mime_input_state.input = q; + switch_mime_getc(); + return 1; + } + if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c)))) + break; + /* Should we check length mod 4? */ + mime_input_buf(mime_input_state.input++) = (unsigned char)c; + d=c; + } + /* In case of Incomplete MIME, no MIME decode */ + mime_input_buf(mime_input_state.input++) = (unsigned char)c; + mime_input_state.last = mime_input_state.input; /* point undecoded buffer */ + mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */ + switch_mime_getc(); /* anyway we need buffered getc */ + return 1; +} + +static nkf_char +mime_begin_strict(FILE *f) { nkf_char c1 = 0; int i,j,k; @@ -5107,24 +4031,24 @@ nkf_char mime_begin_strict(FILE *f) r[0]='='; r[1]='?'; for(i=2;p[i]>SP;i++) { /* start at =? */ - if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) { - /* pattern fails, try next one */ - q = p; - while (mime_pattern[++j]) { + if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) { + /* pattern fails, try next one */ + q = p; + while (mime_pattern[++j]) { p = mime_pattern[j]; - for(k=2;k i */ - if (p[k]!=q[k]) break; - if (k==i && nkf_toupper(c1)==p[k]) break; - } + for(k=2;k i */ + if (p[k]!=q[k]) break; + if (k==i && nkf_toupper(c1)==p[k]) break; + } p = mime_pattern[j]; - if (p) continue; /* found next one, continue */ - /* all fails, output from recovery buffer */ - (*i_ungetc)(c1,f); - for(j=0;j 0){ - (*u)(buf[i], f); - --i; + (*u)(buf[i], f); + --i; } return buf[0]; } -nkf_char numchar_ungetc(nkf_char c, FILE *f) +static nkf_char +numchar_ungetc(nkf_char c, FILE *f) { return (*i_nungetc)(c, f); } @@ -5395,62 +4315,100 @@ nkf_char numchar_ungetc(nkf_char c, FILE *f) #ifdef UNICODE_NORMALIZATION -/* Normalization Form C */ -nkf_char nfc_getc(FILE *f) +static nkf_char +nfc_getc(FILE *f) { nkf_char (*g)(FILE *f) = i_nfc_getc; nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc; - int i=0, j, k=1, lower, upper; - nkf_char buf[9]; - const nkf_nfchar *array; - - buf[i] = (*g)(f); - while (k > 0 && ((buf[i] & 0xc0) != 0x80)){ - lower=0, upper=NORMALIZATION_TABLE_LENGTH-1; - while (upper >= lower) { - j = (lower+upper) / 2; - array = normalization_table[j].nfd; - for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){ - if (array[k] != buf[k]){ - array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1); - k = 0; + nkf_buf_t *buf = nkf_state->nfc_buf; + const unsigned char *array; + int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1; + nkf_char c = (*g)(f); + + if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c; + + nkf_buf_push(buf, c); + do { + while (lower <= upper) { + int mid = (lower+upper) / 2; + int len; + array = normalization_table[mid].nfd; + for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) { + if (len >= nkf_buf_length(buf)) { + c = (*g)(f); + if (c == EOF) { + len = 0; + lower = 1, upper = 0; + break; + } + nkf_buf_push(buf, c); + } + if (array[len] != nkf_buf_at(buf, len)) { + if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1; + else upper = mid - 1; + len = 0; break; - } else if (k >= i) - buf[++i] = (*g)(f); + } } - if (k > 0){ - array = normalization_table[j].nfc; + if (len > 0) { + int i; + array = normalization_table[mid].nfc; + nkf_buf_clear(buf); for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++) - buf[i] = (nkf_char)(array[i]); - i--; + nkf_buf_push(buf, array[i]); break; } } - while (i > 0) - (*u)(buf[i--], f); - } - return buf[0]; + } while (lower <= upper); + + while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f); + c = nkf_buf_pop(buf); + + return c; } -nkf_char nfc_ungetc(nkf_char c, FILE *f) +static nkf_char +nfc_ungetc(nkf_char c, FILE *f) { return (*i_nfc_ungetc)(c, f); } #endif /* UNICODE_NORMALIZATION */ -nkf_char -mime_getc(FILE *f) +static nkf_char +base64decode(nkf_char c) { - nkf_char c1, c2, c3, c4, cc; - nkf_char t1, t2, t3, t4, mode, exit_mode; - nkf_char lwsp_count; - char *lwsp_buf; + int i; + if (c > '@') { + if (c < '[') { + i = c - 'A'; /* A..Z 0-25 */ + } else if (c == '_') { + i = '?' /* 63 */ ; /* _ 63 */ + } else { + i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */ + } + } else if (c > '/') { + i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */ + } else if (c == '+' || c == '-') { + i = '>' /* 62 */ ; /* + and - 62 */ + } else { + i = '?' /* 63 */ ; /* / 63 */ + } + return (i); +} + +static nkf_char +mime_getc(FILE *f) +{ + nkf_char c1, c2, c3, c4, cc; + nkf_char t1, t2, t3, t4, mode, exit_mode; + nkf_char lwsp_count; + char *lwsp_buf; char *lwsp_buf_new; nkf_char lwsp_size = 128; - if (mime_top != mime_last) { /* Something is in FIFO */ - return Fifo(mime_top++); + if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */ + return mime_input_buf(mime_input_state.top++); } if (mime_decode_mode==1 ||mime_decode_mode==FALSE) { mime_decode_mode=FALSE; @@ -5459,38 +4417,34 @@ mime_getc(FILE *f) } if (mimebuf_f == FIXED_MIME) - exit_mode = mime_decode_mode; + exit_mode = mime_decode_mode; else - exit_mode = FALSE; + exit_mode = FALSE; if (mime_decode_mode == 'Q') { - if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF); -restart_mime_q: - if (c1=='_' && mimebuf_f != FIXED_MIME) return SP; + if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF); + restart_mime_q: + if (c1=='_' && mimebuf_f != FIXED_MIME) return SP; if (c1<=SP || DEL<=c1) { mime_decode_mode = exit_mode; /* prepare for quit */ return c1; } - if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) { + if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) { return c1; } - mime_decode_mode = exit_mode; /* prepare for quit */ - if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF); - if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) { - /* end Q encoding */ - input_mode = exit_mode; + mime_decode_mode = exit_mode; /* prepare for quit */ + if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF); + if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) { + /* end Q encoding */ + input_mode = exit_mode; lwsp_count = 0; - lwsp_buf = malloc((lwsp_size+5)*sizeof(char)); - if (lwsp_buf==NULL) { - perror("can't malloc"); - return -1; - } + lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char)); while ((c1=(*i_getc)(f))!=EOF) { switch (c1) { case LF: case CR: if (c1==LF) { - if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) { + if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { i_ungetc(SP,f); continue; } else { @@ -5499,7 +4453,7 @@ restart_mime_q: c1 = LF; } else { if ((c1=(*i_getc)(f))!=EOF && c1 == LF) { - if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) { + if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) { i_ungetc(SP,f); continue; } else { @@ -5517,12 +4471,7 @@ restart_mime_q: lwsp_buf[lwsp_count] = (unsigned char)c1; if (lwsp_count++>lwsp_size){ lwsp_size <<= 1; - lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char)); - if (lwsp_buf_new==NULL) { - free(lwsp_buf); - perror("can't realloc"); - return -1; - } + lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char)); lwsp_buf = lwsp_buf_new; } continue; @@ -5535,70 +4484,66 @@ restart_mime_q: i_ungetc(lwsp_buf[lwsp_count],f); c1 = lwsp_buf[0]; } - free(lwsp_buf); - return c1; - } - if (c1=='='&&c2lwsp_size){ lwsp_size <<= 1; - lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char)); - if (lwsp_buf_new==NULL) { - free(lwsp_buf); - perror("can't realloc"); - return -1; - } + lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char)); lwsp_buf = lwsp_buf_new; } continue; @@ -5646,24 +4586,24 @@ mime_c2_retry: i_ungetc(lwsp_buf[lwsp_count],f); c1 = lwsp_buf[0]; } - free(lwsp_buf); - return c1; + nkf_xfree(lwsp_buf); + return c1; } -mime_c3_retry: + mime_c3_retry: if ((c3 = (*i_mgetc)(f))<=SP) { - if (c3==EOF) - return (EOF); + if (c3==EOF) + return (EOF); if (mime_f != STRICT_MIME) goto mime_c3_retry; - if (mimebuf_f!=FIXED_MIME) input_mode = ASCII; - return c3; + if (mimebuf_f!=FIXED_MIME) input_mode = ASCII; + return c3; } -mime_c4_retry: + mime_c4_retry: if ((c4 = (*i_mgetc)(f))<=SP) { - if (c4==EOF) - return (EOF); + if (c4==EOF) + return (EOF); if (mime_f != STRICT_MIME) goto mime_c4_retry; - if (mimebuf_f!=FIXED_MIME) input_mode = ASCII; - return c4; + if (mimebuf_f!=FIXED_MIME) input_mode = ASCII; + return c4; } mime_decode_mode = mode; /* still in MIME sigh... */ @@ -5676,94 +4616,33 @@ mime_c4_retry: t4 = 0x3f & base64decode(c4); cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03); if (c2 != '=') { - Fifo(mime_last++) = (unsigned char)cc; - cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f); - if (c3 != '=') { - Fifo(mime_last++) = (unsigned char)cc; - cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f); - if (c4 != '=') - Fifo(mime_last++) = (unsigned char)cc; - } - } else { - return c1; - } - return Fifo(mime_top++); -} - -nkf_char mime_ungetc(nkf_char c, FILE *f) -{ - Fifo(--mime_top) = (unsigned char)c; - return c; -} - -nkf_char mime_integrity(FILE *f, const unsigned char *p) -{ - nkf_char c,d; - unsigned int q; - /* In buffered mode, read until =? or NL or buffer full - */ - mime_input = mime_top; - mime_last = mime_top; - - while(*p) Fifo(mime_input++) = *p++; - d = 0; - q = mime_input; - while((c=(*i_getc)(f))!=EOF) { - if (((mime_input-mime_top)&MIME_BUF_MASK)==0) { - break; /* buffer full */ + mime_input_buf(mime_input_state.last++) = (unsigned char)cc; + cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f); + if (c3 != '=') { + mime_input_buf(mime_input_state.last++) = (unsigned char)cc; + cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f); + if (c4 != '=') + mime_input_buf(mime_input_state.last++) = (unsigned char)cc; } - if (c=='=' && d=='?') { - /* checked. skip header, start decode */ - Fifo(mime_input++) = (unsigned char)c; - /* mime_last_input = mime_input; */ - mime_input = q; - switch_mime_getc(); - return 1; - } - if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c)))) - break; - /* Should we check length mod 4? */ - Fifo(mime_input++) = (unsigned char)c; - d=c; - } - /* In case of Incomplete MIME, no MIME decode */ - Fifo(mime_input++) = (unsigned char)c; - mime_last = mime_input; /* point undecoded buffer */ - mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */ - switch_mime_getc(); /* anyway we need buffered getc */ - return 1; -} - -nkf_char base64decode(nkf_char c) -{ - int i; - if (c > '@') { - if (c < '[') { - i = c - 'A'; /* A..Z 0-25 */ - } else if (c == '_') { - i = '?' /* 63 */ ; /* _ 63 */ - } else { - i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */ - } - } else if (c > '/') { - i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */ - } else if (c == '+' || c == '-') { - i = '>' /* 62 */ ; /* + and - 62 */ } else { - i = '?' /* 63 */ ; /* / 63 */ + return c1; } - return (i); + return mime_input_buf(mime_input_state.top++); } static const char basis_64[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; -static nkf_char b64c; -#define MIMEOUT_BUF_LENGTH (60) -char mimeout_buf[MIMEOUT_BUF_LENGTH+1]; -int mimeout_buf_count = 0; +#define MIMEOUT_BUF_LENGTH 74 +static struct { + unsigned char buf[MIMEOUT_BUF_LENGTH+1]; + int count; +} mimeout_state; -void open_mime(nkf_char mode) +/*nkf_char mime_lastchar2, mime_lastchar1;*/ + +static void +open_mime(nkf_char mode) { const unsigned char *p; int i; @@ -5778,40 +4657,71 @@ void open_mime(nkf_char mode) mimeout_mode = mime_encode_method[i]; i = 0; if (base64_count>45) { - if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){ - (*o_mputc)(mimeout_buf[i]); + if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){ + (*o_mputc)(mimeout_state.buf[i]); i++; } - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); (*o_mputc)(SP); base64_count = 1; - if (mimeout_buf_count>0 - && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB - || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) { + if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) { i++; } } - for (;i 0){ + if (c2 == EOF){ + if (base64_count + mimeout_state.count/3*4> 73){ + (*o_base64conv)(EOF,0); + oconv_newline(o_base64conv); + (*o_base64conv)(0,SP); + base64_count = 1; + } + } else { + if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) { + (*o_base64conv)(EOF,0); + oconv_newline(o_base64conv); + (*o_base64conv)(0,SP); + base64_count = 1; + mimeout_mode = -1; + } + } + } else if (c2) { + if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) { + mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B'; + open_mime(output_mode); + (*o_base64conv)(EOF,0); + oconv_newline(o_base64conv); + (*o_base64conv)(0,SP); + base64_count = 1; + mimeout_mode = -1; + } } } -void close_mime(void) +static void +close_mime(void) { (*o_mputc)('?'); (*o_mputc)('='); @@ -5819,20 +4729,21 @@ void close_mime(void) mimeout_mode = 0; } -void eof_mime(void) +static void +eof_mime(void) { switch(mimeout_mode) { case 'Q': case 'B': break; case 2: - (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]); + (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]); (*o_mputc)('='); (*o_mputc)('='); base64_count += 3; break; case 1: - (*o_mputc)(basis_64[((b64c & 0xF) << 2)]); + (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]); (*o_mputc)('='); base64_count += 2; break; @@ -5845,7 +4756,8 @@ void eof_mime(void) } } -void mimeout_addchar(nkf_char c) +static void +mimeout_addchar(nkf_char c) { switch(mimeout_mode) { case 'Q': @@ -5861,156 +4773,123 @@ void mimeout_addchar(nkf_char c) (*o_mputc)(c); base64_count++; } - break; + break; case 'B': - b64c=c; - (*o_mputc)(basis_64[c>>2]); - mimeout_mode=2; - base64_count ++; - break; + nkf_state->mimeout_state=c; + (*o_mputc)(basis_64[c>>2]); + mimeout_mode=2; + base64_count ++; + break; case 2: - (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]); - b64c=c; - mimeout_mode=1; - base64_count ++; - break; + (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]); + nkf_state->mimeout_state=c; + mimeout_mode=1; + base64_count ++; + break; case 1: - (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]); - (*o_mputc)(basis_64[c & 0x3F]); - mimeout_mode='B'; - base64_count += 2; - break; + (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]); + (*o_mputc)(basis_64[c & 0x3F]); + mimeout_mode='B'; + base64_count += 2; + break; default: (*o_mputc)(c); base64_count++; - break; - } -} - -/*nkf_char mime_lastchar2, mime_lastchar1;*/ - -void mime_prechar(nkf_char c2, nkf_char c1) -{ - if (mimeout_mode > 0){ - if (c2 == EOF){ - if (base64_count + mimeout_buf_count/3*4> 73){ - (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); - (*o_base64conv)(0,SP); - base64_count = 1; - } - } else { - if (base64_count + mimeout_buf_count/3*4> 66) { - (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); - (*o_base64conv)(0,SP); - base64_count = 1; - mimeout_mode = -1; - } - } - } else if (c2) { - if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) { - mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B'; - open_mime(output_mode); - (*o_base64conv)(EOF,0); - OCONV_NEWLINE((*o_base64conv)); - (*o_base64conv)(0,SP); - base64_count = 1; - mimeout_mode = -1; - } + break; } } -void mime_putc(nkf_char c) +static void +mime_putc(nkf_char c) { int i, j; nkf_char lastchar; if (mimeout_f == FIXED_MIME){ - if (mimeout_mode == 'Q'){ - if (base64_count > 71){ - if (c!=CR && c!=LF) { - (*o_mputc)('='); - PUT_NEWLINE((*o_mputc)); - } - base64_count = 0; - } - }else{ - if (base64_count > 71){ - eof_mime(); - PUT_NEWLINE((*o_mputc)); - base64_count = 0; - } - if (c == EOF) { /* c==EOF */ - eof_mime(); - } - } - if (c != EOF) { /* c==EOF */ - mimeout_addchar(c); - } - return; + if (mimeout_mode == 'Q'){ + if (base64_count > 71){ + if (c!=CR && c!=LF) { + (*o_mputc)('='); + put_newline(o_mputc); + } + base64_count = 0; + } + }else{ + if (base64_count > 71){ + eof_mime(); + put_newline(o_mputc); + base64_count = 0; + } + if (c == EOF) { /* c==EOF */ + eof_mime(); + } + } + if (c != EOF) { /* c==EOF */ + mimeout_addchar(c); + } + return; } /* mimeout_f != FIXED_MIME */ if (c == EOF) { /* c==EOF */ - if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode); - j = mimeout_buf_count; - mimeout_buf_count = 0; + if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode); + j = mimeout_state.count; + mimeout_state.count = 0; i = 0; if (mimeout_mode > 0) { - if (!nkf_isblank(mimeout_buf[j-1])) { + if (!nkf_isblank(mimeout_state.buf[j-1])) { for (;i 0){ - lastchar = mimeout_buf[mimeout_buf_count - 1]; + if (mimeout_state.count > 0){ + lastchar = mimeout_state.buf[mimeout_state.count - 1]; }else{ - lastchar = -1; + lastchar = -1; } if (mimeout_mode=='Q') { - if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { + if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { if (c == CR || c == LF) { close_mime(); (*o_mputc)(c); base64_count = 0; return; - } else if (c <= SP) { - close_mime(); + } else if (c <= SP) { + close_mime(); if (base64_count > 70) { - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); base64_count = 0; } if (!nkf_isblank(c)) { (*o_mputc)(SP); base64_count++; } - } else { + } else { if (base64_count > 70) { close_mime(); - PUT_NEWLINE((*o_mputc)); + put_newline(o_mputc); (*o_mputc)(SP); base64_count = 1; open_mime(output_mode); @@ -6020,145 +4899,174 @@ void mime_putc(nkf_char c) return; } } - (*o_mputc)(c); - base64_count++; - } - return; + if (c != 0x1B) { + (*o_mputc)(c); + base64_count++; + return; + } + } } if (mimeout_mode <= 0) { - if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) { - if (nkf_isspace(c)) { + if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 || + output_mode == UTF_8)) { + if (nkf_isspace(c)) { int flag = 0; if (mimeout_mode == -1) { flag = 1; } - if (c==CR || c==LF) { + if (c==CR || c==LF) { if (flag) { open_mime(output_mode); output_mode = 0; } else { base64_count = 0; } - } - for (i=0;i 1 - && base64_count + mimeout_buf_count > 76 - && mimeout_buf[0] != CR && mimeout_buf[0] != LF){ - PUT_NEWLINE((*o_mputc)); - base64_count = 0; - if (!nkf_isspace(mimeout_buf[0])){ - (*o_mputc)(SP); - base64_count++; - } - } - mimeout_buf[mimeout_buf_count++] = (char)c; - if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) { - open_mime(output_mode); - } - } - return; - }else{ - if (lastchar==CR || lastchar == LF){ - for (i=0;i 1 + && base64_count + mimeout_state.count > 76 + && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){ + static const char *str = "boundary=\""; + static int len = 10; + i = 0; + + for (; i < mimeout_state.count - len; ++i) { + if (!strncmp((char *)(mimeout_state.buf+i), str, len)) { + i += len - 2; + break; + } + } + + if (i == 0 || i == mimeout_state.count - len) { + put_newline(o_mputc); + base64_count = 0; + if (!nkf_isspace(mimeout_state.buf[0])){ + (*o_mputc)(SP); + base64_count++; + } + } + else { + int j; + for (j = 0; j <= i; ++j) { + (*o_mputc)(mimeout_state.buf[j]); + } + put_newline(o_mputc); + base64_count = 1; + for (; j <= mimeout_state.count; ++j) { + mimeout_state.buf[j - i] = mimeout_state.buf[j]; + } + mimeout_state.count -= i; + } + } + mimeout_state.buf[mimeout_state.count++] = (char)c; + if (mimeout_state.count>MIMEOUT_BUF_LENGTH) { + open_mime(output_mode); + } + } + return; + }else{ + if (lastchar==CR || lastchar == LF){ + for (i=0;iMIMEOUT_BUF_LENGTH) { + eof_mime(); + for (i=0;i0 && SPMIMEOUT_BUF_LENGTH) { + j = mimeout_state.count; + mimeout_state.count = 0; + for (i=0;iMIMEOUT_BUF_LENGTH) { - eof_mime(); - for (i=0;i0 && SPMIMEOUT_BUF_LENGTH) { - j = mimeout_buf_count; - mimeout_buf_count = 0; - for (i=0;i0) { - j = mimeout_buf_count; - mimeout_buf_count = 0; + } + } + if (mimeout_state.count>0) { + j = mimeout_state.count; + mimeout_state.count = 0; for (i=0;iinput_buffer_size = IOBUF_SIZE; + converter->input_buffer = nkf_xmalloc(converter->input_buffer_size); + converter->output_buffer_size = IOBUF_SIZE * 2; + converter->output_buffer = nkf_xmalloc(converter->output_buffer_size); + converter->cd = iconv_open(tocode, fromcode); + if (converter->cd == (iconv_t)-1) + { + switch (errno) { + case EINVAL: + perror(fprintf("iconv doesn't support %s to %s conversion.", fromcode, tocode)); + return -1; + default: + perror("can't iconv_open"); + } + } +} + +static size_t +nkf_iconv_convert(nkf_iconv_t *converter, FILE *input) +{ + size_t invalid = (size_t)0; + char *input_buffer = converter->input_buffer; + size_t input_length = (size_t)0; + char *output_buffer = converter->output_buffer; + size_t output_length = converter->output_buffer_size; + int c; + + do { + if (c != EOF) { + while ((c = (*i_getc)(f)) != EOF) { + input_buffer[input_length++] = c; + if (input_length < converter->input_buffer_size) break; + } + } + + size_t ret = iconv(converter->cd, &input_buffer, &input_length, &output_buffer, &output_length); + while (output_length-- > 0) { + (*o_putc)(output_buffer[converter->output_buffer_size-output_length]); + } + if (ret == (size_t) - 1) { + switch (errno) { + case EINVAL: + if (input_buffer != converter->input_buffer) + memmove(converter->input_buffer, input_buffer, input_length); + break; + case E2BIG: + converter->output_buffer_size *= 2; + output_buffer = realloc(converter->outbuf, converter->output_buffer_size); + if (output_buffer == NULL) { + perror("can't realloc"); + return -1; + } + converter->output_buffer = output_buffer; + break; + default: + perror("can't iconv"); + return -1; + } + } else { + invalid += ret; + } + } while (1); + + return invalid; +} + + +static void +nkf_iconv_close(nkf_iconv_t *convert) +{ + nkf_xfree(converter->inbuf); + nkf_xfree(converter->outbuf); + iconv_close(converter->cd); +} +#endif + -void reinit(void) +static void +reinit(void) { { - struct input_code *p = input_code_list; - while (p->name){ - status_reinit(p++); - } + struct input_code *p = input_code_list; + while (p->name){ + status_reinit(p++); + } } unbuf_f = FALSE; estab_f = FALSE; @@ -6188,248 +5194,1524 @@ void reinit(void) broken_f = FALSE; iso8859_f = FALSE; mimeout_f = FALSE; - x0201_f = X0201_DEFAULT; + x0201_f = NKF_UNSPECIFIED; iso2022jp_f = FALSE; #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE) ms_ucs_map_f = UCS_MAP_ASCII; #endif -#ifdef UTF8_INPUT_ENABLE - no_cp932ext_f = FALSE; - no_best_fit_chars_f = FALSE; - encode_fallback = NULL; - unicode_subchar = '?'; - input_endian = ENDIAN_BIG; +#ifdef UTF8_INPUT_ENABLE + no_cp932ext_f = FALSE; + no_best_fit_chars_f = FALSE; + encode_fallback = NULL; + unicode_subchar = '?'; + input_endian = ENDIAN_BIG; +#endif +#ifdef UTF8_OUTPUT_ENABLE + output_bom_f = FALSE; + output_endian = ENDIAN_BIG; +#endif +#ifdef UNICODE_NORMALIZATION + nfc_f = FALSE; +#endif +#ifdef INPUT_OPTION + cap_f = FALSE; + url_f = FALSE; + numchar_f = FALSE; +#endif +#ifdef CHECK_OPTION + noout_f = FALSE; + debug_f = FALSE; +#endif + guess_f = 0; +#ifdef EXEC_IO + exec_f = 0; +#endif +#ifdef SHIFTJIS_CP932 + cp51932_f = TRUE; + cp932inv_f = TRUE; +#endif +#ifdef X0212_ENABLE + x0212_f = FALSE; + x0213_f = FALSE; +#endif + { + int i; + for (i = 0; i < 256; i++){ + prefix_table[i] = 0; + } + } + hold_count = 0; + mimeout_state.count = 0; + mimeout_mode = 0; + base64_count = 0; + f_line = 0; + f_prev = 0; + fold_preserve_f = FALSE; + fold_f = FALSE; + fold_len = 0; + kanji_intro = DEFAULT_J; + ascii_intro = DEFAULT_R; + fold_margin = FOLD_MARGIN; + o_zconv = no_connection; + o_fconv = no_connection; + o_eol_conv = no_connection; + o_rot_conv = no_connection; + o_hira_conv = no_connection; + o_base64conv = no_connection; + o_iso2022jp_check_conv = no_connection; + o_putc = std_putc; + i_getc = std_getc; + i_ungetc = std_ungetc; + i_bgetc = std_getc; + i_bungetc = std_ungetc; + o_mputc = std_putc; + i_mgetc = std_getc; + i_mungetc = std_ungetc; + i_mgetc_buf = std_getc; + i_mungetc_buf = std_ungetc; + output_mode = ASCII; + input_mode = ASCII; + mime_decode_mode = FALSE; + file_out_f = FALSE; + eolmode_f = 0; + input_eol = 0; + prev_cr = 0; + option_mode = 0; + z_prev2=0,z_prev1=0; +#ifdef CHECK_OPTION + iconv_for_check = 0; +#endif + input_codename = NULL; + input_encoding = NULL; + output_encoding = NULL; + nkf_state_init(); +#ifdef WIN32DLL + reinitdll(); +#endif /*WIN32DLL*/ +} + +static int +module_connection(void) +{ + if (input_encoding) set_input_encoding(input_encoding); + if (!output_encoding) { + output_encoding = nkf_default_encoding(); + } + if (!output_encoding) { + if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP); + else return -1; + } + set_output_encoding(output_encoding); + oconv = nkf_enc_to_oconv(output_encoding); + o_putc = std_putc; + if (nkf_enc_unicode_p(output_encoding)) + output_mode = UTF_8; + + if (x0201_f == NKF_UNSPECIFIED) { + x0201_f = X0201_DEFAULT; + } + + /* replace continucation module, from output side */ + + /* output redicrection */ +#ifdef CHECK_OPTION + if (noout_f || guess_f){ + o_putc = no_putc; + } +#endif + if (mimeout_f) { + o_mputc = o_putc; + o_putc = mime_putc; + if (mimeout_f == TRUE) { + o_base64conv = oconv; oconv = base64_conv; + } + /* base64_count = 0; */ + } + + if (eolmode_f || guess_f) { + o_eol_conv = oconv; oconv = eol_conv; + } + if (rot_f) { + o_rot_conv = oconv; oconv = rot_conv; + } + if (iso2022jp_f) { + o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv; + } + if (hira_f) { + o_hira_conv = oconv; oconv = hira_conv; + } + if (fold_f) { + o_fconv = oconv; oconv = fold_conv; + f_line = 0; + } + if (alpha_f || x0201_f) { + o_zconv = oconv; oconv = z_conv; + } + + i_getc = std_getc; + i_ungetc = std_ungetc; + /* input redicrection */ +#ifdef INPUT_OPTION + if (cap_f){ + i_cgetc = i_getc; i_getc = cap_getc; + i_cungetc = i_ungetc; i_ungetc= cap_ungetc; + } + if (url_f){ + i_ugetc = i_getc; i_getc = url_getc; + i_uungetc = i_ungetc; i_ungetc= url_ungetc; + } +#endif +#ifdef NUMCHAR_OPTION + if (numchar_f){ + i_ngetc = i_getc; i_getc = numchar_getc; + i_nungetc = i_ungetc; i_ungetc= numchar_ungetc; + } +#endif +#ifdef UNICODE_NORMALIZATION + if (nfc_f){ + i_nfc_getc = i_getc; i_getc = nfc_getc; + i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc; + } +#endif + if (mime_f && mimebuf_f==FIXED_MIME) { + i_mgetc = i_getc; i_getc = mime_getc; + i_mungetc = i_ungetc; i_ungetc = mime_ungetc; + } + if (broken_f & 1) { + i_bgetc = i_getc; i_getc = broken_getc; + i_bungetc = i_ungetc; i_ungetc = broken_ungetc; + } + if (input_encoding) { + set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding)); + } else { + set_iconv(FALSE, e_iconv); + } + + { + struct input_code *p = input_code_list; + while (p->name){ + status_reinit(p++); + } + } + return 0; +} + +/* + Conversion main loop. Code detection only. + */ + +#if !defined(PERL_XS) && !defined(WIN32DLL) +static nkf_char +noconvert(FILE *f) +{ + nkf_char c; + + if (nop_f == 2) + module_connection(); + while ((c = (*i_getc)(f)) != EOF) + (*o_putc)(c); + (*o_putc)(EOF); + return 1; +} +#endif + +#define NEXT continue /* no output, get next */ +#define SKIP c2=0;continue /* no output, get next */ +#define MORE c2=c1;continue /* need one more byte */ +#define SEND (void)0 /* output c1 and c2, get next */ +#define LAST break /* end of loop, go closing */ +#define set_input_mode(mode) do { \ + input_mode = mode; \ + shift_mode = 0; \ + set_input_codename("ISO-2022-JP"); \ + debug("ISO-2022-JP"); \ +} while (0) + +static int +kanji_convert(FILE *f) +{ + nkf_char c1=0, c2=0, c3=0, c4=0; + int shift_mode = 0; /* 0, 1, 2, 3 */ + int g2 = 0; + int is_8bit = FALSE; + + if (input_encoding && !nkf_enc_asciicompat(input_encoding)) { + is_8bit = TRUE; + } + + input_mode = ASCII; + output_mode = ASCII; + + if (module_connection() < 0) { +#if !defined(PERL_XS) && !defined(WIN32DLL) + fprintf(stderr, "no output encoding given\n"); +#endif + return -1; + } + check_bom(f); + +#ifdef UTF8_INPUT_ENABLE + if(iconv == w_iconv32){ + while ((c1 = (*i_getc)(f)) != EOF && + (c2 = (*i_getc)(f)) != EOF && + (c3 = (*i_getc)(f)) != EOF && + (c4 = (*i_getc)(f)) != EOF) { + nkf_iconv_utf_32(c1, c2, c3, c4); + } + goto finished; + } + else if (iconv == w_iconv16) { + while ((c1 = (*i_getc)(f)) != EOF && + (c2 = (*i_getc)(f)) != EOF) { + if (nkf_iconv_utf_16(c1, c2, 0, 0) == NKF_ICONV_NEED_TWO_MORE_BYTES && + (c3 = (*i_getc)(f)) != EOF && + (c4 = (*i_getc)(f)) != EOF) { + nkf_iconv_utf_16(c1, c2, c3, c4); + } + } + goto finished; + } +#endif + + while ((c1 = (*i_getc)(f)) != EOF) { +#ifdef INPUT_CODE_FIX + if (!input_encoding) +#endif + code_status(c1); + if (c2) { + /* second byte */ + if (c2 > DEL) { + /* in case of 8th bit is on */ + if (!estab_f&&!mime_decode_mode) { + /* in case of not established yet */ + /* It is still ambiguious */ + if (h_conv(f, c2, c1)==EOF) { + LAST; + } + else { + SKIP; + } + } + else { + /* in case of already established */ + if (c1 < 0x40) { + /* ignore bogus code */ + SKIP; + } else { + SEND; + } + } + } + else { + /* 2nd byte of 7 bit code or SJIS */ + SEND; + } + } + else if (nkf_char_unicode_p(c1)) { + (*oconv)(0, c1); + NEXT; + } + else { + /* first byte */ + if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) { + /* CP5022x */ + MORE; + }else if (input_codename && input_codename[0] == 'I' && + 0xA1 <= c1 && c1 <= 0xDF) { + /* JIS X 0201 Katakana in 8bit JIS */ + c2 = JIS_X_0201_1976_K; + c1 &= 0x7f; + SEND; + } else if (c1 > DEL) { + /* 8 bit code */ + if (!estab_f && !iso8859_f) { + /* not established yet */ + MORE; + } else { /* estab_f==TRUE */ + if (iso8859_f) { + c2 = ISO_8859_1; + c1 &= 0x7f; + SEND; + } + else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) || + (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) { + /* JIS X 0201 */ + c2 = JIS_X_0201_1976_K; + c1 &= 0x7f; + SEND; + } + else { + /* already established */ + MORE; + } + } + } else if (SP < c1 && c1 < DEL) { + /* in case of Roman characters */ + if (shift_mode) { + /* output 1 shifted byte */ + if (iso8859_f) { + c2 = ISO_8859_1; + SEND; + } else if (nkf_byte_jisx0201_katakana_p(c1)){ + /* output 1 shifted byte */ + c2 = JIS_X_0201_1976_K; + SEND; + } else { + /* look like bogus code */ + SKIP; + } + } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 || + input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) { + /* in case of Kanji shifted */ + MORE; + } else if (c1 == '=' && mime_f && !mime_decode_mode) { + /* Check MIME code */ + if ((c1 = (*i_getc)(f)) == EOF) { + (*oconv)(0, '='); + LAST; + } else if (c1 == '?') { + /* =? is mime conversion start sequence */ + if(mime_f == STRICT_MIME) { + /* check in real detail */ + if (mime_begin_strict(f) == EOF) + LAST; + SKIP; + } else if (mime_begin(f) == EOF) + LAST; + SKIP; + } else { + (*oconv)(0, '='); + (*i_ungetc)(c1,f); + SKIP; + } + } else { + /* normal ASCII code */ + SEND; + } + } else if (c1 == SI && (!is_8bit || mime_decode_mode)) { + shift_mode = 0; + SKIP; + } else if (c1 == SO && (!is_8bit || mime_decode_mode)) { + shift_mode = 1; + SKIP; + } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) { + if ((c1 = (*i_getc)(f)) == EOF) { + (*oconv)(0, ESC); + LAST; + } + else if (c1 == '&') { + /* IRR */ + if ((c1 = (*i_getc)(f)) == EOF) { + LAST; + } else { + SKIP; + } + } + else if (c1 == '$') { + /* GZDMx */ + if ((c1 = (*i_getc)(f)) == EOF) { + /* don't send bogus code + (*oconv)(0, ESC); + (*oconv)(0, '$'); */ + LAST; + } else if (c1 == '@' || c1 == 'B') { + /* JIS X 0208 */ + set_input_mode(JIS_X_0208); + SKIP; + } else if (c1 == '(') { + /* GZDM4 */ + if ((c1 = (*i_getc)(f)) == EOF) { + /* don't send bogus code + (*oconv)(0, ESC); + (*oconv)(0, '$'); + (*oconv)(0, '('); + */ + LAST; + } else if (c1 == '@'|| c1 == 'B') { + /* JIS X 0208 */ + set_input_mode(JIS_X_0208); + SKIP; +#ifdef X0212_ENABLE + } else if (c1 == 'D'){ + set_input_mode(JIS_X_0212); + SKIP; +#endif /* X0212_ENABLE */ + } else if (c1 == 'O' || c1 == 'Q'){ + set_input_mode(JIS_X_0213_1); + SKIP; + } else if (c1 == 'P'){ + set_input_mode(JIS_X_0213_2); + SKIP; + } else { + /* could be some special code */ + (*oconv)(0, ESC); + (*oconv)(0, '$'); + (*oconv)(0, '('); + (*oconv)(0, c1); + SKIP; + } + } else if (broken_f&0x2) { + /* accept any ESC-(-x as broken code ... */ + input_mode = JIS_X_0208; + shift_mode = 0; + SKIP; + } else { + (*oconv)(0, ESC); + (*oconv)(0, '$'); + (*oconv)(0, c1); + SKIP; + } + } else if (c1 == '(') { + /* GZD4 */ + if ((c1 = (*i_getc)(f)) == EOF) { + /* don't send bogus code + (*oconv)(0, ESC); + (*oconv)(0, '('); */ + LAST; + } + else if (c1 == 'I') { + /* JIS X 0201 Katakana */ + set_input_mode(JIS_X_0201_1976_K); + SKIP; + } + else if (c1 == 'B' || c1 == 'J' || c1 == 'H') { + /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */ + set_input_mode(ASCII); + SKIP; + } + else if (broken_f&0x2) { + set_input_mode(ASCII); + SKIP; + } + else { + (*oconv)(0, ESC); + (*oconv)(0, '('); + SEND; + } + } + else if (c1 == '.') { + /* G2D6 */ + if ((c1 = (*i_getc)(f)) == EOF) { + LAST; + } + else if (c1 == 'A') { + /* ISO-8859-1 */ + g2 = ISO_8859_1; + SKIP; + } + else { + (*oconv)(0, ESC); + (*oconv)(0, '.'); + SEND; + } + } + else if (c1 == 'N') { + /* SS2 */ + c1 = (*i_getc)(f); + if (g2 == ISO_8859_1) { + c2 = ISO_8859_1; + SEND; + }else{ + (*i_ungetc)(c1, f); + /* lonely ESC */ + (*oconv)(0, ESC); + SEND; + } + } + else { + /* lonely ESC */ + (*oconv)(0, ESC); + SEND; + } + } else if (c1 == ESC && iconv == s_iconv) { + /* ESC in Shift_JIS */ + if ((c1 = (*i_getc)(f)) == EOF) { + (*oconv)(0, ESC); + LAST; + } else if (c1 == '$') { + /* J-PHONE emoji */ + if ((c1 = (*i_getc)(f)) == EOF) { + LAST; + } else if (('E' <= c1 && c1 <= 'G') || + ('O' <= c1 && c1 <= 'Q')) { + /* + NUM : 0 1 2 3 4 5 + BYTE: G E F O P Q + C%7 : 1 6 0 2 3 4 + C%7 : 0 1 2 3 4 5 6 + NUM : 2 0 3 4 5 X 1 + */ + static const nkf_char jphone_emoji_first_table[7] = + {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0}; + c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]); + if ((c1 = (*i_getc)(f)) == EOF) LAST; + while (SP <= c1 && c1 <= 'z') { + (*oconv)(0, c1 + c3); + if ((c1 = (*i_getc)(f)) == EOF) LAST; + } + SKIP; + } + else { + (*oconv)(0, ESC); + (*oconv)(0, '$'); + SEND; + } + } + else { + /* lonely ESC */ + (*oconv)(0, ESC); + SEND; + } + } else if (c1 == LF || c1 == CR) { + if (broken_f&4) { + input_mode = ASCII; set_iconv(FALSE, 0); + SEND; + } else if (mime_decode_f && !mime_decode_mode){ + if (c1 == LF) { + if ((c1=(*i_getc)(f))!=EOF && c1 == SP) { + i_ungetc(SP,f); + continue; + } else { + i_ungetc(c1,f); + } + c1 = LF; + SEND; + } else { /* if (c1 == CR)*/ + if ((c1=(*i_getc)(f))!=EOF) { + if (c1==SP) { + i_ungetc(SP,f); + continue; + } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) { + i_ungetc(SP,f); + continue; + } else { + i_ungetc(c1,f); + } + i_ungetc(LF,f); + } else { + i_ungetc(c1,f); + } + c1 = CR; + SEND; + } + } + } else + SEND; + } + /* send: */ + switch(input_mode){ + case ASCII: + switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */ + case -2: + /* 4 bytes UTF-8 */ + if ((c3 = (*i_getc)(f)) != EOF) { + code_status(c3); + c3 <<= 8; + if ((c4 = (*i_getc)(f)) != EOF) { + code_status(c4); + (*iconv)(c2, c1, c3|c4); + } + } + break; + case -1: + /* 3 bytes EUC or UTF-8 */ + if ((c3 = (*i_getc)(f)) != EOF) { + code_status(c3); + (*iconv)(c2, c1, c3); + } + break; + } + break; + case JIS_X_0208: + case JIS_X_0213_1: + if (ms_ucs_map_f && + 0x7F <= c2 && c2 <= 0x92 && + 0x21 <= c1 && c1 <= 0x7E) { + /* CP932 UDC */ + c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000); + c2 = 0; + } + (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */ + break; +#ifdef X0212_ENABLE + case JIS_X_0212: + (*oconv)(PREFIX_EUCG3 | c2, c1); + break; +#endif /* X0212_ENABLE */ + case JIS_X_0213_2: + (*oconv)(PREFIX_EUCG3 | c2, c1); + break; + default: + (*oconv)(input_mode, c1); /* other special case */ + } + + c2 = 0; + c3 = 0; + continue; + /* goto next_word */ + } + +finished: + /* epilogue */ + (*iconv)(EOF, 0, 0); + if (!input_codename) + { + if (is_8bit) { + struct input_code *p = input_code_list; + struct input_code *result = p; + while (p->name){ + if (p->score < result->score) result = p; + ++p; + } + set_input_codename(result->name); +#ifdef CHECK_OPTION + debug(result->name); +#endif + } + } + return 0; +} + +/* + * int options(unsigned char *cp) + * + * return values: + * 0: success + * -1: ArgumentError + */ +static int +options(unsigned char *cp) +{ + nkf_char i, j; + unsigned char *p; + unsigned char *cp_back = NULL; + nkf_encoding *enc; + + if (option_mode==1) + return 0; + while(*cp && *cp++!='-'); + while (*cp || cp_back) { + if(!*cp){ + cp = cp_back; + cp_back = NULL; + continue; + } + p = 0; + switch (*cp++) { + case '-': /* literal options */ + if (!*cp || *cp == SP) { /* ignore the rest of arguments */ + option_mode = 1; + return 0; + } + for (i=0;i<(int)(sizeof(long_option)/sizeof(long_option[0]));i++) { + p = (unsigned char *)long_option[i].name; + for (j=0;*p && *p != '=' && *p == cp[j];p++, j++); + if (*p == cp[j] || cp[j] == SP){ + p = &cp[j] + 1; + break; + } + p = 0; + } + if (p == 0) { +#if !defined(PERL_XS) && !defined(WIN32DLL) + fprintf(stderr, "unknown long option: --%s\n", cp); +#endif + return -1; + } + while(*cp && *cp != SP && cp++); + if (long_option[i].alias[0]){ + cp_back = cp; + cp = (unsigned char *)long_option[i].alias; + }else{ +#ifndef PERL_XS + if (strcmp(long_option[i].name, "help") == 0){ + usage(); + exit(EXIT_SUCCESS); + } +#endif + if (strcmp(long_option[i].name, "ic=") == 0){ + enc = nkf_enc_find((char *)p); + if (!enc) continue; + input_encoding = enc; + continue; + } + if (strcmp(long_option[i].name, "oc=") == 0){ + enc = nkf_enc_find((char *)p); + /* if (enc <= 0) continue; */ + if (!enc) continue; + output_encoding = enc; + continue; + } + if (strcmp(long_option[i].name, "guess=") == 0){ + if (p[0] == '0' || p[0] == '1') { + guess_f = 1; + } else { + guess_f = 2; + } + continue; + } +#ifdef OVERWRITE + if (strcmp(long_option[i].name, "overwrite") == 0){ + file_out_f = TRUE; + overwrite_f = TRUE; + preserve_time_f = TRUE; + continue; + } + if (strcmp(long_option[i].name, "overwrite=") == 0){ + file_out_f = TRUE; + overwrite_f = TRUE; + preserve_time_f = TRUE; + backup_f = TRUE; + backup_suffix = (char *)p; + continue; + } + if (strcmp(long_option[i].name, "in-place") == 0){ + file_out_f = TRUE; + overwrite_f = TRUE; + preserve_time_f = FALSE; + continue; + } + if (strcmp(long_option[i].name, "in-place=") == 0){ + file_out_f = TRUE; + overwrite_f = TRUE; + preserve_time_f = FALSE; + backup_f = TRUE; + backup_suffix = (char *)p; + continue; + } +#endif +#ifdef INPUT_OPTION + if (strcmp(long_option[i].name, "cap-input") == 0){ + cap_f = TRUE; + continue; + } + if (strcmp(long_option[i].name, "url-input") == 0){ + url_f = TRUE; + continue; + } +#endif +#ifdef NUMCHAR_OPTION + if (strcmp(long_option[i].name, "numchar-input") == 0){ + numchar_f = TRUE; + continue; + } +#endif +#ifdef CHECK_OPTION + if (strcmp(long_option[i].name, "no-output") == 0){ + noout_f = TRUE; + continue; + } + if (strcmp(long_option[i].name, "debug") == 0){ + debug_f = TRUE; + continue; + } +#endif + if (strcmp(long_option[i].name, "cp932") == 0){ +#ifdef SHIFTJIS_CP932 + cp51932_f = TRUE; + cp932inv_f = -TRUE; +#endif +#ifdef UTF8_OUTPUT_ENABLE + ms_ucs_map_f = UCS_MAP_CP932; +#endif + continue; + } + if (strcmp(long_option[i].name, "no-cp932") == 0){ +#ifdef SHIFTJIS_CP932 + cp51932_f = FALSE; + cp932inv_f = FALSE; +#endif +#ifdef UTF8_OUTPUT_ENABLE + ms_ucs_map_f = UCS_MAP_ASCII; +#endif + continue; + } +#ifdef SHIFTJIS_CP932 + if (strcmp(long_option[i].name, "cp932inv") == 0){ + cp932inv_f = -TRUE; + continue; + } +#endif + +#ifdef X0212_ENABLE + if (strcmp(long_option[i].name, "x0212") == 0){ + x0212_f = TRUE; + continue; + } +#endif + +#ifdef EXEC_IO + if (strcmp(long_option[i].name, "exec-in") == 0){ + exec_f = 1; + return 0; + } + if (strcmp(long_option[i].name, "exec-out") == 0){ + exec_f = -1; + return 0; + } +#endif +#if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE) + if (strcmp(long_option[i].name, "no-cp932ext") == 0){ + no_cp932ext_f = TRUE; + continue; + } + if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){ + no_best_fit_chars_f = TRUE; + continue; + } + if (strcmp(long_option[i].name, "fb-skip") == 0){ + encode_fallback = NULL; + continue; + } + if (strcmp(long_option[i].name, "fb-html") == 0){ + encode_fallback = encode_fallback_html; + continue; + } + if (strcmp(long_option[i].name, "fb-xml") == 0){ + encode_fallback = encode_fallback_xml; + continue; + } + if (strcmp(long_option[i].name, "fb-java") == 0){ + encode_fallback = encode_fallback_java; + continue; + } + if (strcmp(long_option[i].name, "fb-perl") == 0){ + encode_fallback = encode_fallback_perl; + continue; + } + if (strcmp(long_option[i].name, "fb-subchar") == 0){ + encode_fallback = encode_fallback_subchar; + continue; + } + if (strcmp(long_option[i].name, "fb-subchar=") == 0){ + encode_fallback = encode_fallback_subchar; + unicode_subchar = 0; + if (p[0] != '0'){ + /* decimal number */ + for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){ + unicode_subchar *= 10; + unicode_subchar += hex2bin(p[i]); + } + }else if(p[1] == 'x' || p[1] == 'X'){ + /* hexadecimal number */ + for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){ + unicode_subchar <<= 4; + unicode_subchar |= hex2bin(p[i]); + } + }else{ + /* octal number */ + for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){ + unicode_subchar *= 8; + unicode_subchar += hex2bin(p[i]); + } + } + w16e_conv(unicode_subchar, &i, &j); + unicode_subchar = i<<8 | j; + continue; + } +#endif +#ifdef UTF8_OUTPUT_ENABLE + if (strcmp(long_option[i].name, "ms-ucs-map") == 0){ + ms_ucs_map_f = UCS_MAP_MS; + continue; + } +#endif +#ifdef UNICODE_NORMALIZATION + if (strcmp(long_option[i].name, "utf8mac-input") == 0){ + nfc_f = TRUE; + continue; + } +#endif + if (strcmp(long_option[i].name, "prefix=") == 0){ + if (nkf_isgraph(p[0])){ + for (i = 1; nkf_isgraph(p[i]); i++){ + prefix_table[p[i]] = p[0]; + } + } + continue; + } +#if !defined(PERL_XS) && !defined(WIN32DLL) + fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name); +#endif + return -1; + } + continue; + case 'b': /* buffered mode */ + unbuf_f = FALSE; + continue; + case 'u': /* non bufferd mode */ + unbuf_f = TRUE; + continue; + case 't': /* transparent mode */ + if (*cp=='1') { + /* alias of -t */ + cp++; + nop_f = TRUE; + } else if (*cp=='2') { + /* + * -t with put/get + * + * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin + * + */ + cp++; + nop_f = 2; + } else + nop_f = TRUE; + continue; + case 'j': /* JIS output */ + case 'n': + output_encoding = nkf_enc_from_index(ISO_2022_JP); + continue; + case 'e': /* AT&T EUC output */ + output_encoding = nkf_enc_from_index(EUCJP_NKF); + continue; + case 's': /* SJIS output */ + output_encoding = nkf_enc_from_index(SHIFT_JIS); + continue; + case 'l': /* ISO8859 Latin-1 support, no conversion */ + iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */ + input_encoding = nkf_enc_from_index(ISO_8859_1); + continue; + case 'i': /* Kanji IN ESC-$-@/B */ + if (*cp=='@'||*cp=='B') + kanji_intro = *cp++; + continue; + case 'o': /* ASCII IN ESC-(-J/B/H */ + /* ESC ( H was used in initial JUNET messages */ + if (*cp=='J'||*cp=='B'||*cp=='H') + ascii_intro = *cp++; + continue; + case 'h': + /* + bit:1 katakana->hiragana + bit:2 hiragana->katakana + */ + if ('9'>= *cp && *cp>='0') + hira_f |= (*cp++ -'0'); + else + hira_f |= 1; + continue; + case 'r': + rot_f = TRUE; + continue; +#if defined(MSDOS) || defined(__OS2__) + case 'T': + binmode_f = FALSE; + continue; +#endif +#ifndef PERL_XS + case 'V': + show_configuration(); + exit(EXIT_SUCCESS); + break; + case 'v': + version(); + exit(EXIT_SUCCESS); + break; +#endif +#ifdef UTF8_OUTPUT_ENABLE + case 'w': /* UTF-{8,16,32} output */ + if (cp[0] == '8') { + cp++; + if (cp[0] == '0'){ + cp++; + output_encoding = nkf_enc_from_index(UTF_8N); + } else { + output_bom_f = TRUE; + output_encoding = nkf_enc_from_index(UTF_8_BOM); + } + } else { + int enc_idx; + if ('1'== cp[0] && '6'==cp[1]) { + cp += 2; + enc_idx = UTF_16; + } else if ('3'== cp[0] && '2'==cp[1]) { + cp += 2; + enc_idx = UTF_32; + } else { + output_encoding = nkf_enc_from_index(UTF_8); + continue; + } + if (cp[0]=='L') { + cp++; + output_endian = ENDIAN_LITTLE; + output_bom_f = TRUE; + } else if (cp[0] == 'B') { + cp++; + output_bom_f = TRUE; + } + if (cp[0] == '0'){ + output_bom_f = FALSE; + cp++; + enc_idx = enc_idx == UTF_16 + ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE) + : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE); + } else { + enc_idx = enc_idx == UTF_16 + ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM) + : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM); + } + output_encoding = nkf_enc_from_index(enc_idx); + } + continue; +#endif +#ifdef UTF8_INPUT_ENABLE + case 'W': /* UTF input */ + if (cp[0] == '8') { + cp++; + input_encoding = nkf_enc_from_index(UTF_8); + }else{ + int enc_idx; + if ('1'== cp[0] && '6'==cp[1]) { + cp += 2; + input_endian = ENDIAN_BIG; + enc_idx = UTF_16; + } else if ('3'== cp[0] && '2'==cp[1]) { + cp += 2; + input_endian = ENDIAN_BIG; + enc_idx = UTF_32; + } else { + input_encoding = nkf_enc_from_index(UTF_8); + continue; + } + if (cp[0]=='L') { + cp++; + input_endian = ENDIAN_LITTLE; + } else if (cp[0] == 'B') { + cp++; + input_endian = ENDIAN_BIG; + } + enc_idx = (enc_idx == UTF_16 + ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE) + : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE)); + input_encoding = nkf_enc_from_index(enc_idx); + } + continue; +#endif + /* Input code assumption */ + case 'J': /* ISO-2022-JP input */ + input_encoding = nkf_enc_from_index(ISO_2022_JP); + continue; + case 'E': /* EUC-JP input */ + input_encoding = nkf_enc_from_index(EUCJP_NKF); + continue; + case 'S': /* Shift_JIS input */ + input_encoding = nkf_enc_from_index(SHIFT_JIS); + continue; + case 'Z': /* Convert X0208 alphabet to asii */ + /* alpha_f + bit:0 Convert JIS X 0208 Alphabet to ASCII + bit:1 Convert Kankaku to one space + bit:2 Convert Kankaku to two spaces + bit:3 Convert HTML Entity + bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana + */ + while ('0'<= *cp && *cp <='4') { + alpha_f |= 1 << (*cp++ - '0'); + } + alpha_f |= 1; + continue; + case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */ + x0201_f = FALSE; /* No X0201->X0208 conversion */ + /* accept X0201 + ESC-(-I in JIS, EUC, MS Kanji + SI/SO in JIS, EUC, MS Kanji + SS2 in EUC, JIS, not in MS Kanji + MS Kanji (0xa0-0xdf) + output X0201 + ESC-(-I in JIS (0x20-0x5f) + SS2 in EUC (0xa0-0xdf) + 0xa0-0xd in MS Kanji (0xa0-0xdf) + */ + continue; + case 'X': /* Convert X0201 kana to X0208 */ + x0201_f = TRUE; + continue; + case 'F': /* prserve new lines */ + fold_preserve_f = TRUE; + case 'f': /* folding -f60 or -f */ + fold_f = TRUE; + fold_len = 0; + while('0'<= *cp && *cp <='9') { /* we don't use atoi here */ + fold_len *= 10; + fold_len += *cp++ - '0'; + } + if (!(0= *cp && *cp>='0') + broken_f |= 1<<(*cp++ -'0'); + else + broken_f |= TRUE; + continue; +#ifndef PERL_XS + case 'O':/* for Output file */ + file_out_f = TRUE; + continue; +#endif + case 'c':/* add cr code */ + eolmode_f = CRLF; + continue; + case 'd':/* delete cr code */ + eolmode_f = LF; + continue; + case 'I': /* ISO-2022-JP output */ + iso2022jp_f = TRUE; + continue; + case 'L': /* line mode */ + if (*cp=='u') { /* unix */ + eolmode_f = LF; cp++; + } else if (*cp=='m') { /* mac */ + eolmode_f = CR; cp++; + } else if (*cp=='w') { /* windows */ + eolmode_f = CRLF; cp++; + } else if (*cp=='0') { /* no conversion */ + eolmode_f = 0; cp++; + } + continue; +#ifndef PERL_XS + case 'g': + if ('2' <= *cp && *cp <= '9') { + guess_f = 2; + cp++; + } else if (*cp == '0' || *cp == '1') { + guess_f = 1; + cp++; + } else { + guess_f = 1; + } + continue; +#endif + case SP: + /* module muliple options in a string are allowed for Perl moudle */ + while(*cp && *cp++!='-'); + continue; + default: +#if !defined(PERL_XS) && !defined(WIN32DLL) + fprintf(stderr, "unknown option: -%c\n", *(cp-1)); #endif -#ifdef UTF8_OUTPUT_ENABLE - output_bom_f = FALSE; - output_endian = ENDIAN_BIG; + /* bogus option but ignored */ + return -1; + } + } + return 0; +} + +#ifdef WIN32DLL +#include "nkf32dll.c" +#elif defined(PERL_XS) +#else /* WIN32DLL */ +int +main(int argc, char **argv) +{ + FILE *fin; + unsigned char *cp; + + char *outfname = NULL; + char *origfname; + +#ifdef EASYWIN /*Easy Win */ + _BufferSize.y = 400;/*Set Scroll Buffer Size*/ #endif -#ifdef UNICODE_NORMALIZATION - nfc_f = FALSE; +#ifdef DEFAULT_CODE_LOCALE + setlocale(LC_CTYPE, ""); #endif -#ifdef INPUT_OPTION - cap_f = FALSE; - url_f = FALSE; - numchar_f = FALSE; + nkf_state_init(); + + for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) { + cp = (unsigned char *)*argv; + options(cp); +#ifdef EXEC_IO + if (exec_f){ + int fds[2], pid; + if (pipe(fds) < 0 || (pid = fork()) < 0){ + abort(); + } + if (pid == 0){ + if (exec_f > 0){ + close(fds[0]); + dup2(fds[1], 1); + }else{ + close(fds[1]); + dup2(fds[0], 0); + } + execvp(argv[1], &argv[1]); + } + if (exec_f > 0){ + close(fds[1]); + dup2(fds[0], 0); + }else{ + close(fds[0]); + dup2(fds[1], 1); + } + argc = 0; + break; + } #endif + } + + if (guess_f) { #ifdef CHECK_OPTION - noout_f = FALSE; - debug_f = FALSE; + int debug_f_back = debug_f; #endif - guess_f = 0; #ifdef EXEC_IO - exec_f = 0; -#endif -#ifdef SHIFTJIS_CP932 - cp51932_f = TRUE; - cp932inv_f = TRUE; + int exec_f_back = exec_f; #endif #ifdef X0212_ENABLE - x0212_f = FALSE; - x0213_f = FALSE; + int x0212_f_back = x0212_f; #endif - { - int i; - for (i = 0; i < 256; i++){ - prefix_table[i] = 0; - } - } - hold_count = 0; - mimeout_buf_count = 0; - mimeout_mode = 0; - base64_count = 0; - f_line = 0; - f_prev = 0; - fold_preserve_f = FALSE; - fold_f = FALSE; - fold_len = 0; - kanji_intro = DEFAULT_J; - ascii_intro = DEFAULT_R; - fold_margin = FOLD_MARGIN; - oconv = DEFAULT_CONV; - o_zconv = no_connection; - o_fconv = no_connection; - o_nlconv = no_connection; - o_rot_conv = no_connection; - o_hira_conv = no_connection; - o_base64conv = no_connection; - o_iso2022jp_check_conv = no_connection; - o_putc = std_putc; - i_getc = std_getc; - i_ungetc = std_ungetc; - i_bgetc = std_getc; - i_bungetc = std_ungetc; - o_mputc = std_putc; - i_mgetc = std_getc; - i_mungetc = std_ungetc; - i_mgetc_buf = std_getc; - i_mungetc_buf = std_ungetc; - output_mode = ASCII; - input_mode = ASCII; - shift_mode = FALSE; - mime_decode_mode = FALSE; - file_out_f = FALSE; - nlmode_f = 0; - input_newline = 0; - prev_cr = 0; - option_mode = 0; - broken_counter = 0; - broken_last = 0; - z_prev2=0,z_prev1=0; + int x0213_f_back = x0213_f; + int guess_f_back = guess_f; + reinit(); + guess_f = guess_f_back; + mime_f = FALSE; #ifdef CHECK_OPTION - iconv_for_check = 0; + debug_f = debug_f_back; #endif - input_codename = NULL; - input_encoding = NULL; - output_encoding = nkf_enc_from_index(DEFAULT_ENCODING); -#ifdef WIN32DLL - reinitdll(); -#endif /*WIN32DLL*/ -} +#ifdef EXEC_IO + exec_f = exec_f_back; +#endif + x0212_f = x0212_f_back; + x0213_f = x0213_f_back; + } -void no_connection(nkf_char c2, nkf_char c1) -{ - no_connection2(c2,c1,0); -} + if (binmode_f == TRUE) +#if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) + if (freopen("","wb",stdout) == NULL) + return (-1); +#else + setbinmode(stdout); +#endif -nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0) -{ - fprintf(stderr,"nkf internal module connection failure.\n"); - exit(1); - return 0; /* LINT */ -} + if (unbuf_f) + setbuf(stdout, (char *) NULL); + else + setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE); -#ifndef PERL_XS -#ifdef WIN32DLL -#define fprintf dllprintf -#endif -void usage(void) -{ - fprintf(HELP_OUTPUT,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"); - fprintf(HELP_OUTPUT,"Flags:\n"); - fprintf(HELP_OUTPUT,"b,u Output is buffered (DEFAULT),Output is unbuffered\n"); -#ifdef DEFAULT_CODE_SJIS - fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift_JIS (DEFAULT), EUC-JP, UTF-8N\n"); -#endif -#ifdef DEFAULT_CODE_JIS - fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit (DEFAULT), Shift JIS, EUC-JP, UTF-8N\n"); -#endif -#ifdef DEFAULT_CODE_EUC - fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP (DEFAULT), UTF-8N\n"); + if (argc == 0) { + if (binmode_f == TRUE) +#if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) + if (freopen("","rb",stdin) == NULL) return (-1); +#else + setbinmode(stdin); #endif -#ifdef DEFAULT_CODE_UTF8 - fprintf(HELP_OUTPUT,"j,s,e,w Output code is JIS 7 bit, Shift JIS, EUC-JP, UTF-8N (DEFAULT)\n"); + setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE); + if (nop_f) + noconvert(stdin); + else { + kanji_convert(stdin); + if (guess_f) print_guessed_code(NULL); + } + } else { + int nfiles = argc; + int is_argument_error = FALSE; + while (argc--) { + input_codename = NULL; + input_eol = 0; +#ifdef CHECK_OPTION + iconv_for_check = 0; #endif -#ifdef UTF8_OUTPUT_ENABLE - fprintf(HELP_OUTPUT," After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"); + if ((fin = fopen((origfname = *argv++), "r")) == NULL) { + perror(*(argv-1)); + is_argument_error = TRUE; + continue; + } else { +#ifdef OVERWRITE + int fd = 0; + int fd_backup = 0; #endif - fprintf(HELP_OUTPUT,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"); -#ifdef UTF8_INPUT_ENABLE - fprintf(HELP_OUTPUT," After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"); -#endif - fprintf(HELP_OUTPUT,"t no conversion\n"); - fprintf(HELP_OUTPUT,"i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"); - fprintf(HELP_OUTPUT,"o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"); - fprintf(HELP_OUTPUT,"r {de/en}crypt ROT13/47\n"); - fprintf(HELP_OUTPUT,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"); - fprintf(HELP_OUTPUT,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n"); - fprintf(HELP_OUTPUT,"M[BQ] MIME encode [B:base64 Q:quoted]\n"); - fprintf(HELP_OUTPUT,"l ISO8859-1 (Latin-1) support\n"); - fprintf(HELP_OUTPUT,"f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"); - fprintf(HELP_OUTPUT,"Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"); - fprintf(HELP_OUTPUT," 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"); - fprintf(HELP_OUTPUT," 4: JISX0208 Katakana to JISX0201 Katakana\n"); - fprintf(HELP_OUTPUT,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"); - fprintf(HELP_OUTPUT,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"); + + /* reopen file for stdout */ + if (file_out_f == TRUE) { +#ifdef OVERWRITE + if (overwrite_f){ + outfname = nkf_xmalloc(strlen(origfname) + + strlen(".nkftmpXXXXXX") + + 1); + strcpy(outfname, origfname); #ifdef MSDOS - fprintf(HELP_OUTPUT,"T Text mode output\n"); -#endif - fprintf(HELP_OUTPUT,"O Output to File (DEFAULT 'nkf.out')\n"); - fprintf(HELP_OUTPUT,"I Convert non ISO-2022-JP charactor to GETA\n"); - fprintf(HELP_OUTPUT,"d,c Convert line breaks -d: LF -c: CRLF\n"); - fprintf(HELP_OUTPUT,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"); - fprintf(HELP_OUTPUT,"v, V Show this usage. V: show configuration\n"); - fprintf(HELP_OUTPUT,"\n"); - fprintf(HELP_OUTPUT,"Long name options\n"); - fprintf(HELP_OUTPUT," --ic= --oc=\n"); - fprintf(HELP_OUTPUT," Specify the input or output codeset\n"); - fprintf(HELP_OUTPUT," --fj --unix --mac --windows\n"); - fprintf(HELP_OUTPUT," --jis --euc --sjis --utf8 --utf16 --mime --base64\n"); - fprintf(HELP_OUTPUT," Convert for the system or code\n"); - fprintf(HELP_OUTPUT," --hiragana --katakana --katakana-hiragana\n"); - fprintf(HELP_OUTPUT," To Hiragana/Katakana Conversion\n"); - fprintf(HELP_OUTPUT," --prefix= Insert escape before troublesome characters of Shift_JIS\n"); -#ifdef INPUT_OPTION - fprintf(HELP_OUTPUT," --cap-input, --url-input Convert hex after ':' or '%%'\n"); -#endif -#ifdef NUMCHAR_OPTION - fprintf(HELP_OUTPUT," --numchar-input Convert Unicode Character Reference\n"); + { + int i; + for (i = strlen(outfname); i; --i){ + if (outfname[i - 1] == '/' + || outfname[i - 1] == '\\'){ + break; + } + } + outfname[i] = '\0'; + } + strcat(outfname, "ntXXXXXX"); + mktemp(outfname); + fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, + S_IREAD | S_IWRITE); +#else + strcat(outfname, ".nkftmpXXXXXX"); + fd = mkstemp(outfname); #endif -#ifdef UTF8_INPUT_ENABLE - fprintf(HELP_OUTPUT," --fb-{skip, html, xml, perl, java, subchar}\n"); - fprintf(HELP_OUTPUT," Specify how nkf handles unassigned characters\n"); + if (fd < 0 + || (fd_backup = dup(fileno(stdout))) < 0 + || dup2(fd, fileno(stdout)) < 0 + ){ + perror(origfname); + return -1; + } + }else #endif -#ifdef OVERWRITE - fprintf(HELP_OUTPUT," --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"); - fprintf(HELP_OUTPUT," Overwrite original listed files by filtered result\n"); - fprintf(HELP_OUTPUT," --overwrite preserves timestamp of original files\n"); -#endif - fprintf(HELP_OUTPUT," -g --guess Guess the input code\n"); - fprintf(HELP_OUTPUT," --help --version Show this help/the version\n"); - fprintf(HELP_OUTPUT," For more information, see also man nkf\n"); - fprintf(HELP_OUTPUT,"\n"); - version(); -} + if(argc == 1) { + outfname = *argv++; + argc--; + } else { + outfname = "nkf.out"; + } -void show_configuration(void) -{ - fprintf(HELP_OUTPUT, "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"); - fprintf(HELP_OUTPUT, " Compile-time options:\n"); - fprintf(HELP_OUTPUT, " Default output encoding: " -#if defined(DEFAULT_CODE_JIS) - "ISO-2022-JP" -#elif defined(DEFAULT_CODE_SJIS) - "Shift_JIS" -#elif defined(DEFAULT_CODE_EUC) - "EUC-JP" -#elif defined(DEFAULT_CODE_UTF8) - "UTF-8" -#endif - "\n"); - fprintf(HELP_OUTPUT, " Default output newline: " -#if DEFAULT_NEWLINE == CR - "CR" -#elif DEFAULT_NEWLINE == CRLF - "CRLF" + if(freopen(outfname, "w", stdout) == NULL) { + perror (outfname); + return (-1); + } + if (binmode_f == TRUE) { +#if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) + if (freopen("","wb",stdout) == NULL) + return (-1); #else - "LF" + setbinmode(stdout); #endif - "\n"); - fprintf(HELP_OUTPUT, " Decode MIME encoded string: " -#if MIME_DECODE_DEFAULT - "ON" + } + } + if (binmode_f == TRUE) +#if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__)) + if (freopen("","rb",fin) == NULL) + return (-1); #else - "OFF" -#endif - "\n"); - fprintf(HELP_OUTPUT, " Convert JIS X 0201 Katakana: " -#if X0201_DEFAULT - "ON" + setbinmode(fin); +#endif + setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE); + if (nop_f) + noconvert(fin); + else { + char *filename = NULL; + kanji_convert(fin); + if (nfiles > 1) filename = origfname; + if (guess_f) print_guessed_code(filename); + } + fclose(fin); +#ifdef OVERWRITE + if (overwrite_f) { + struct stat sb; +#if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__) + time_t tb[2]; #else - "OFF" + struct utimbuf tb; #endif - "\n"); - fprintf(HELP_OUTPUT, " --help, --version output: " -#if HELP_OUTPUT_HELP_OUTPUT -"HELP_OUTPUT" + + fflush(stdout); + close(fd); + if (dup2(fd_backup, fileno(stdout)) < 0){ + perror("dup2"); + } + if (stat(origfname, &sb)) { + fprintf(stderr, "Can't stat %s\n", origfname); + } + /* パーミッションを復元 */ + if (chmod(outfname, sb.st_mode)) { + fprintf(stderr, "Can't set permission %s\n", outfname); + } + + /* タイムスタンプを復元 */ + if(preserve_time_f){ +#if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__) + tb[0] = tb[1] = sb.st_mtime; + if (utime(outfname, tb)) { + fprintf(stderr, "Can't set timestamp %s\n", outfname); + } #else -"STDOUT" + tb.actime = sb.st_atime; + tb.modtime = sb.st_mtime; + if (utime(outfname, &tb)) { + fprintf(stderr, "Can't set timestamp %s\n", outfname); + } #endif -"\n"); -} - -void version(void) -{ - fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n"); + } + if(backup_f){ + char *backup_filename = get_backup_filename(backup_suffix, origfname); +#ifdef MSDOS + unlink(backup_filename); +#endif + if (rename(origfname, backup_filename)) { + perror(backup_filename); + fprintf(stderr, "Can't rename %s to %s\n", + origfname, backup_filename); + } + nkf_xfree(backup_filename); + }else{ +#ifdef MSDOS + if (unlink(origfname)){ + perror(origfname); + } +#endif + } + if (rename(outfname, origfname)) { + perror(origfname); + fprintf(stderr, "Can't rename %s to %s\n", + outfname, origfname); + } + nkf_xfree(outfname); + } +#endif + } + } + if (is_argument_error) + return(-1); + } +#ifdef EASYWIN /*Easy Win */ + if (file_out_f == FALSE) + scanf("%d",&end_check); + else + fclose(stdout); +#else /* for Other OS */ + if (file_out_f == TRUE) + fclose(stdout); +#endif /*Easy Win */ + return (0); } -#endif /*PERL_XS*/ +#endif /* WIN32DLL */