** E-Mail: furukawa@tcp-ip.or.jp
** \e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#\e(B
***********************************************************************/
-/* $Id: nkf.c,v 1.50 2004/12/29 21:04:23 naruse Exp $ */
-#define NKF_VERSION "2.0.4"
-#define NKF_RELEASE_DATE "2004-12-01"
+/* $Id: nkf.c,v 1.72 2005/07/10 04:36:50 naruse Exp $ */
+#define NKF_VERSION "2.0.5"
+#define NKF_RELEASE_DATE "2005-07-10"
#include "config.h"
static char *CopyRight =
- "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW, 2002-2004 Kono, Furukawa";
+ "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW, 2002-2005 Kono, Furukawa, Naruse";
/*
#ifdef PERL_XS
#undef OVERWRITE
#endif
+#if defined( UTF8_OUTPUT_ENABLE ) || defined( UTF8_INPUT_ENABLE )
+#define UNICODE_ENABLE
+#else
+#undef UNICODE_NORMALIZATION
+#endif
#ifndef PERL_XS
#include <stdio.h>
#ifndef MSDOS /* UNIX, OS/2 */
#include <unistd.h>
#include <utime.h>
-#else
+#else /* defined(MSDOS) */
+#ifdef __WIN32__
+#ifdef __BORLANDC__ /* BCC32 */
+#include <utime.h>
+#else /* !defined(__BORLANDC__) */
+#include <sys/utime.h>
+#endif /* (__BORLANDC__) */
+#else /* !defined(__WIN32__) */
#if defined(_MSC_VER) || defined(__MINGW32__) /* VC++, MinGW */
#include <sys/utime.h>
#elif defined(__TURBOC__) /* BCC */
#include <utime.h>
#elif defined(LSI_C) /* LSI C */
+#endif /* (__WIN32__) */
#endif
#endif
#endif
#define UTF8 12
#define UTF8_INPUT 13
-#define UTF16LE_INPUT 14
-#define UTF16BE_INPUT 15
+#define UTF16BE_INPUT 14
+#define UTF16LE_INPUT 15
#define WISH_TRUE 15
#define GETA2 0x2e
-#if defined( UTF8_OUTPUT_ENABLE ) || defined( UTF8_INPUT_ENABLE )
+#ifdef UNICODE_ENABLE
#define sizeof_euc_utf8 94
#define sizeof_euc_to_utf8_1byte 94
#define sizeof_euc_to_utf8_2bytes 94
STATIC int kanji_convert PROTO((FILE *f));
STATIC int h_conv PROTO((FILE *f,int c2,int c1));
STATIC int push_hold_buf PROTO((int c2));
-STATIC void set_iconv PROTO((int f, int (*iconv_func)()));
+STATIC void set_iconv PROTO((int f, int (*iconv_func)(int c2,int c1,int c0)));
STATIC int s_iconv PROTO((int c2,int c1,int c0));
STATIC int s2e_conv PROTO((int c2, int c1, int *p2, int *p1));
STATIC int e_iconv PROTO((int c2,int c1,int c0));
STATIC int mime_integrity PROTO((FILE *f,unsigned char *p));
STATIC int base64decode PROTO((int c));
+STATIC void mime_prechar PROTO((int c2, int c1));
STATIC void mime_putc PROTO((int c));
STATIC void open_mime PROTO((int c));
STATIC void close_mime PROTO(());
STATIC void usage PROTO(());
STATIC void version PROTO(());
STATIC void options PROTO((unsigned char *c));
-#ifdef PERL_XS
+#if defined(PERL_XS) || defined(WIN32DLL)
STATIC void reinit PROTO(());
#endif
static unsigned int mime_top = 0;
static unsigned int mime_last = 0; /* decoded */
static unsigned int mime_input = 0; /* undecoded */
+static int (*mime_iconv_back)PROTO((int c2,int c1,int c0)) = NULL;
/* flags */
static int unbuf_f = FALSE;
static int input_f = FALSE; /* non fixed input code */
static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */
static int mime_f = STRICT_MIME; /* convert MIME B base64 or Q */
+static int mime_decode_f = FALSE; /* mime decode is explicitly on */
static int mimebuf_f = FALSE; /* MIME buffered input */
static int broken_f = FALSE; /* convert ESC-less broken JIS */
static int iso8859_f = FALSE; /* ISO8859 through */
static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
#endif
static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
+#ifdef UNICODE_ENABLE
+static int internal_unicode_f = FALSE; /* Internal Unicode Processing */
+#endif
#ifdef UTF8_OUTPUT_ENABLE
static int unicode_bom_f= 0; /* Output Unicode BOM */
static int w_oconv16_LE = 0; /* utf-16 little endian */
static int ms_ucs_map_f = FALSE; /* Microsoft UCS Mapping Compatible */
#endif
-
-#ifdef NUMCHAR_OPTION
-
-#define CLASS_MASK 0x0f000000
-#define CLASS_UTF16 0x01000000
+#ifdef UNICODE_NORMALIZATION
+static int nfc_f = FALSE;
+static int (*i_nfc_getc)PROTO((FILE *)) = std_getc; /* input of ugetc */
+static int (*i_nfc_ungetc)PROTO((int c ,FILE *f)) = std_ungetc;
+STATIC int nfc_getc PROTO((FILE *f));
+STATIC int nfc_ungetc PROTO((int c,FILE *f));
#endif
#ifdef INPUT_OPTION
static int (*i_uungetc)PROTO((int c ,FILE *f)) = std_ungetc;
STATIC int url_getc PROTO((FILE *f));
STATIC int url_ungetc PROTO((int c,FILE *f));
+#endif
+#ifdef NUMCHAR_OPTION
+#define CLASS_MASK 0x0f000000
+#define CLASS_UTF16 0x01000000
static int numchar_f = FALSE;
static int (*i_ngetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
static int (*i_nungetc)PROTO((int c ,FILE *f)) = std_ungetc;
STATIC void no_putc PROTO((int c));
static int debug_f = FALSE;
STATIC void debug PROTO((char *str));
+static int (*iconv_for_check)() = 0;
#endif
static int guess_f = FALSE;
#ifdef UTF8_INPUT_ENABLE
STATIC void w_status PROTO((struct input_code *, int));
STATIC void w16_status PROTO((struct input_code *, int));
-static int utf16_mode = UTF16LE_INPUT;
+static int utf16_mode = UTF16BE_INPUT;
#endif
struct input_code input_code_list[] = {
static int end_check;
#endif /*Easy Win */
-#ifndef PERL_XS
+#define STD_GC_BUFSIZE (256)
+int std_gc_buf[STD_GC_BUFSIZE];
+int std_gc_ndx;
+
+#ifdef WIN32DLL
+#include "nkf32dll.c"
+#elif defined(PERL_XS)
+#else /* WIN32DLL */
int
main(argc, argv)
int argc;
} else {
int nfiles = argc;
while (argc--) {
+ is_inputcode_mixed = FALSE;
+ is_inputcode_set = FALSE;
+ input_codename = "";
+#ifdef CHECK_OPTION
+ iconv_for_check = 0;
+#endif
if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
perror(*--argv);
return(-1);
#ifdef OVERWRITE
if (overwrite) {
struct stat sb;
-#if defined(MSDOS) && !defined(__MINGW32__)
+#if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
time_t tb[2];
#else
struct utimbuf tb;
}
/* \e$B%?%$%`%9%?%s%W$rI|85\e(B */
-#if defined(MSDOS) && !defined(__MINGW32__)
+#if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__)
tb[0] = tb[1] = sb.st_mtime;
if (utime(outfname, tb)) {
fprintf(stderr, "Can't set timestamp %s\n", outfname);
#else /* for Other OS */
if (file_out == TRUE)
fclose(stdout);
-#endif
+#endif /*Easy Win */
return (0);
}
-#endif
+#endif /* WIN32DLL */
static
struct {
#ifdef X0212_ENABLE
{"x0212", ""},
#endif
+#ifdef UNICODE_ENABLE
+ {"internal-unicode", ""},
+#endif
#ifdef UTF8_OUTPUT_ENABLE
{"utf8", "w"},
{"utf16", "w16"},
{"utf8-input", "W"},
{"utf16-input", "W16"},
#endif
+#ifdef UNICODE_NORMALIZATION
+ {"utf8mac-input", ""},
+#endif
#ifdef OVERWRITE
{"overwrite", ""},
#endif
if (option_mode==1)
return;
- if (*cp++ != '-')
- return;
+ while(*cp && *cp++!='-');
while (*cp) {
- if (p && !*cp) {
- cp = p;
- p = 0;
- }
+ p = 0;
switch (*cp++) {
case '-': /* literal options */
if (!*cp) { /* ignore the rest of arguments */
for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
int j;
p = (unsigned char *)long_option[i].name;
- for (j=0;*p && (*p != '=') && *p == cp[j];p++, j++);
- if (!*p || *p == cp[j]){
+ for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
+ if (*p == cp[j] || cp[j] == ' '){
p = &cp[j];
break;
}
if (p == 0) return;
cp = (unsigned char *)long_option[i].alias;
if (!*cp){
+ cp = p;
#ifdef OVERWRITE
if (strcmp(long_option[i].name, "overwrite") == 0){
file_out = TRUE;
return;
}
#endif
+#ifdef UNICODE_ENABLE
+ if (strcmp(long_option[i].name, "internal-unicode") == 0){
+ internal_unicode_f = TRUE;
+ continue;
+ }
+#endif
#ifdef UTF8_OUTPUT_ENABLE
if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
ms_ucs_map_f = TRUE;
continue;
}
#endif
+#ifdef UNICODE_NORMALIZATION
+ if (strcmp(long_option[i].name, "utf8mac-input") == 0){
+ input_f = UTF8_INPUT;
+ nfc_f = TRUE;
+ continue;
+ }
+#endif
if (strcmp(long_option[i].name, "prefix=") == 0){
if (*p == '=' && ' ' < p[1] && p[1] < 128){
for (i = 2; ' ' < p[i] && p[i] < 128; i++){
continue;
case 'h':
/*
- bit:1 hira -> kata
- bit:2 kata -> hira
+ bit:1 katakana->hiragana
+ bit:2 hiragana->katakana
*/
if ('9'>= *cp && *cp>='0')
hira_f |= (*cp++ -'0');
#ifdef UTF8_INPUT_ENABLE
case 'W': /* UTF-8 input */
if ('1'== cp[0] && '6'==cp[1]) {
- input_f = UTF16LE_INPUT;
+ input_f = UTF16BE_INPUT;
+ utf16_mode = UTF16BE_INPUT;
+ cp += 2;
if (cp[0]=='L') {
cp++;
+ input_f = UTF16LE_INPUT;
+ utf16_mode = UTF16LE_INPUT;
} else if (cp[0] == 'B') {
cp++;
input_f = UTF16BE_INPUT;
+ utf16_mode = UTF16BE_INPUT;
}
} else if (cp[0] == '8') {
cp++;
}
continue;
case 'm': /* MIME support */
+ /* mime_decode_f = TRUE; */ /* this has too large side effects... */
if (*cp=='B'||*cp=='Q') {
mime_decode_mode = *cp++;
mimebuf_f = FIXED_MIME;
} else if (*cp=='S') {
mime_f = STRICT_MIME; cp++;
} else if (*cp=='0') {
+ mime_decode_f = FALSE;
mime_f = FALSE; cp++;
}
continue;
continue;
case ' ':
/* module muliple options in a string are allowed for Perl moudle */
- while(*cp && *cp!='-') cp++;
- if(*cp=='-') cp++;
+ while(*cp && *cp++!='-');
continue;
default:
/* bogus option but ignored */
int (*iconv_func)();
#endif
{
-#ifdef CHECK_OPTION
- static int (*iconv_for_check)() = 0;
-#endif
#ifdef INPUT_CODE_FIX
if (f || !input_f)
#endif
}
}
-#define STD_GC_BUFSIZE (256)
-int std_gc_buf[STD_GC_BUFSIZE];
-int std_gc_ndx;
-
+#ifndef WIN32DLL
int
std_getc(f)
FILE *f;
}
return getc(f);
}
+#endif /*WIN32DLL*/
int
std_ungetc(c,f)
return c;
}
+#ifndef WIN32DLL
void
std_putc(c)
int c;
if(c!=EOF)
putchar(c);
}
+#endif /*WIN32DLL*/
int
noconvert(f)
i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
}
#endif
+#ifdef UNICODE_NORMALIZATION
+ if (nfc_f && input_f == UTF8_INPUT){
+ i_nfc_getc = i_getc; i_getc = nfc_getc;
+ i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
+ }
+#endif
if (mime_f && mimebuf_f==FIXED_MIME) {
i_mgetc = i_getc; i_getc = mime_getc;
i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
#ifdef UTF8_INPUT_ENABLE
} else if (input_f == UTF8_INPUT) {
set_iconv(-TRUE, w_iconv);
+ } else if (input_f == UTF16BE_INPUT) {
+ set_iconv(-TRUE, w_iconv16);
} else if (input_f == UTF16LE_INPUT) {
set_iconv(-TRUE, w_iconv16);
#endif
{
int c1,
c2, c3;
+ int is_8bit = FALSE;
module_connection();
c2 = 0;
/* second byte */
if (c2 > DEL) {
/* in case of 8th bit is on */
- if (!estab_f) {
+ if (!estab_f&&!mime_decode_mode) {
/* in case of not established yet */
/* It is still ambiguious */
if (h_conv(f, c2, c1)==EOF)
/* 8 bit code */
if (!estab_f && !iso8859_f) {
/* not established yet */
+ if (!is_8bit) is_8bit = TRUE;
c2 = c1;
NEXT;
} else { /* estab_f==TRUE */
} else if ((c1 == NL || c1 == CR) && broken_f&4) {
input_mode = ASCII; set_iconv(FALSE, 0);
SEND;
- /*
- } else if (c1 == NL && mime_f && !mime_decode_mode ) {
+ } else if (c1 == NL && mime_decode_f && !mime_decode_mode ) {
if ((c1=(*i_getc)(f))!=EOF && c1 == SPACE) {
i_ungetc(SPACE,f);
continue;
}
c1 = NL;
SEND;
- } else if (c1 == CR && mime_f && !mime_decode_mode ) {
+ } else if (c1 == CR && mime_decode_f && !mime_decode_mode ) {
if ((c1=(*i_getc)(f))!=EOF) {
if (c1==SPACE) {
i_ungetc(SPACE,f);
}
c1 = CR;
SEND;
- */
} else
SEND;
}
/* epilogue */
(*iconv)(EOF, 0, 0);
+ if (!is_inputcode_set)
+ {
+ if (is_8bit) {
+ struct input_code *p = input_code_list;
+ struct input_code *result = p;
+ while (p->name){
+ if (p->score < result->score) result = p;
+ ++p;
+ }
+ set_input_codename(result->name);
+ }
+ }
return 1;
}
int c2,
c1, c0;
{
- int ret = w2e_conv(c2, c1, c0, &c2, &c1);
+ int ret = 0;
+ unsigned short val = 0;
+
+ if (c0 == 0){
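+ if (c2 < 0x80 || (c2 & 0xc0) == 0xdf) /* intended ranges: 0x00-0x7f 0xc0-0xdf */
+ ; /* 1 or 2 bytes.  NOTE(review): (c2 & 0xc0) can never equal 0xdf,
+ * so lead bytes 0xc0-0xdf are not matched by the test above and fall
+ * through to the branches below; (c2 & 0xe0) == 0xc0 was probably
+ * intended -- confirm before changing. */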
+ else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
+ return -1; /* 3bytes */
+#ifdef __COMMENT__
+ else if (0xf0 <= c2)
+ return 0; /* 4,5,6bytes */
+ else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
+ return 0; /* trail byte */
+#endif
+ else return 0;
+ }
+ if (c2 == EOF);
+ else if (c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
+ return 0; /* throw BOM */
+ else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
+ val = ww16_conv(c2, c1, c0);
+ c2 = (val >> 8) & 0xff;
+ c1 = val & 0xff;
+ } else {
+ ret = w2e_conv(c2, c1, c0, &c2, &c1);
+ }
if (ret == 0){
(*oconv)(c2, c1);
}
w_iconv16(c2, c1, c0)
int c2, c1,c0;
{
- int ret;
+ int ret = 0;
if (c2==0376 && c1==0377){
- utf16_mode = UTF16LE_INPUT;
+ utf16_mode = UTF16BE_INPUT;
return 0;
} else if (c2==0377 && c1==0376){
- utf16_mode = UTF16BE_INPUT;
+ utf16_mode = UTF16LE_INPUT;
return 0;
}
- if (c2 != EOF && utf16_mode == UTF16BE_INPUT) {
+ if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
int tmp;
tmp=c1; c1=c2; c2=tmp;
}
(*oconv)(c2, c1);
return 0;
}
- ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
+ if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16));
+ else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
if (ret) return ret;
(*oconv)(c2, c1);
return 0;
c1;
{
int c0;
+ unsigned short val;
if (c2 == EOF) {
(*o_putc)(EOF);
return;
output_mode = ISO8859_1;
(*o_putc)(c1 | 0x080);
} else {
- unsigned short val;
output_mode = UTF8;
- val = e2w_conv(c2, c1);
+ if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16))
+ val = ((c2<<8)&0xff00) + c1;
+ else val = e2w_conv(c2, c1);
if (val){
w16w_conv(val, &c2, &c1, &c0);
(*o_putc)(c2);
unicode_bom_f=1;
}
- if (c2 == ISO8859_1) {
+ if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16)){
+ } else if (c2 == ISO8859_1) {
c2 = 0;
c1 |= 0x80;
#ifdef NUMCHAR_OPTION
int c2,
c1;
{
- if (base64_count>50 && !mimeout_mode && c2==0 && c1==SPACE) {
- (*o_putc)(EOF);
- (*o_putc)(NL);
- } else if (base64_count>66 && mimeout_mode) {
- (*o_base64conv)(EOF,0);
- (*o_base64conv)(NL,0);
- (*o_base64conv)(SPACE,0);
- }
+ mime_prechar(c2, c1);
(*o_base64conv)(c2,c1);
}
#define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c)
#define nkf_isdigit(c) ('0'<=c && c<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=c && c<='f') || ('A'<=c && c <= 'F'))
+#define nkf_isblank(c) (c == SPACE || c == TAB)
+#define nkf_isspace(c) (nkf_isblank(c) || c == CR || c == NL)
+#define nkf_isalpha(c) (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
void
switch_mime_getc()
}
i_getc = i_mgetc;
i_ungetc = i_mungetc;
+ if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
+ mime_iconv_back = NULL;
}
int
}
mime_decode_mode = p[i-2];
+ mime_iconv_back = iconv;
+ set_iconv(FALSE, mime_priority_func[j]);
clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
if (mime_decode_mode=='B') {
is_inputcode_set = TRUE;
}
+#ifndef WIN32DLL
void
print_guessed_code (filename)
char *filename;
if (filename != NULL) printf("%s:", filename);
printf("%s\n", codename);
}
+#endif /*WIN32DLL*/
int
hex2bin(x)
}
#endif
+#ifdef UNICODE_NORMALIZATION
+
+/* Normalization Form C
+ *
+ * Input filter that composes decomposed sequences while reading.
+ * It pulls input units from the saved reader i_nfc_getc (presumably
+ * UTF-8 bytes, given the 10xxxxxx test below), looks the pending units
+ * up in normalization_table[] by each entry's .nfd sequence and, on a
+ * hit, substitutes that entry's .nfc sequence.  Every unit after the
+ * first is handed back through i_nfc_ungetc, so one call yields one
+ * unit.
+ *
+ * f: stream passed through unchanged to the saved reader/unreader.
+ * Returns buf[0], the first unit of the (possibly composed) sequence.
+ *
+ * NOTE(review): buf holds 9 ints; this presumably covers
+ * NORMALIZATION_TABLE_NFD_LENGTH plus one read-ahead unit -- confirm
+ * against the table definition, which is not visible here.
+ */
+int
+nfc_getc(f)
+ FILE *f;
+{
+ int (*g)() = i_nfc_getc; /* reader that was active before this filter */
+ int (*u)() = i_nfc_ungetc; /* matching unreader */
+ int i=0, j, k=1, lower, upper; /* i: index of last unit held in buf; k: >0 means "last lookup matched" (starts at 1 to enter the loop) */
+ int buf[9]; /* pending units, buf[0] is what gets returned */
+ int *array = NULL; /* sequence of the table entry being examined */
+ extern struct normalization_pair normalization_table[];
+
+ buf[i] = (*g)(f);
+ while (k > 0 && ((buf[i] & 0xc0) != 0x80)){ /* repeat after each hit, unless buf[i] has the 10xxxxxx bit pattern */
+ lower=0, upper=NORMALIZATION_TABLE_LENGTH-1; /* binary search bounds; assumes the table is ordered by .nfd -- TODO confirm */
+ while (upper >= lower) {
+ j = (lower+upper) / 2;
+ array = normalization_table[j].nfd;
+ for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){ /* compare unit by unit; a 0 element ends the entry */
+ if (array[k] != buf[k]){
+ array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1); /* narrow the range toward buf */
+ k = 0; /* mark "no match" for the test below */
+ break;
+ } else if (k >= i) /* matched up to the last unit read so far */
+ buf[++i] = (*g)(f); /* read one more unit ahead */
+ }
+ if (k > 0){ /* the whole .nfd sequence matched */
+ array = normalization_table[j].nfc;
+ for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++) /* overwrite buf with the composed sequence */
+ buf[i] = array[i];
+ i--; /* i = index of the last composed unit */
+ break; /* leave the binary search */
+ }
+ }
+ while (i > 0) /* hand back everything except buf[0], last unit first */
+ (*u)(buf[i--], f);
+ }
+ return buf[0];
+}
+/* Unget hook installed together with nfc_getc: forwards c and f
+ * unchanged to the saved i_nfc_ungetc handler and returns its result. */
+int
+nfc_ungetc(c, f)
+ int c;
+ FILE *f;
+{
+ return (*i_nfc_ungetc)(c, f);
+}
+#endif /* UNICODE_NORMALIZATION */
+
int
mime_getc(f)
if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
restart_mime_q:
if (c1=='_') return ' ';
+ if (c1<=' ' || DEL<=c1) {
+ mime_decode_mode = FALSE; /* quit */
+ unswitch_mime_getc();
+ return c1;
+ }
if (c1!='=' && c1!='?') {
return c1;
}
mime_decode_mode = exit_mode; /* prepare for quit */
- if (c1<=' ') return c1;
if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
/* end Q encoding */
i = 0;
if (base64_count>45) {
+ if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
+ (*o_mputc)(mimeout_buf[i]);
+ i++;
+ }
(*o_mputc)(NL);
(*o_mputc)(SPACE);
base64_count = 1;
{
switch(mimeout_mode) {
case 'Q':
- if(c>=DEL) {
+ if(c==SPACE){
+ (*o_mputc)('_');
+ base64_count++;
+ } else if (c==CR||c==NL) {
+ (*o_mputc)(c);
+ base64_count = 0;
+ } else if(c<SPACE||c=='='||c=='?'||c=='_'||DEL<=c) {
(*o_mputc)('=');
(*o_mputc)(itoh4(((c>>4)&0xf)));
(*o_mputc)(itoh4((c&0xf)));
mimeout_mode='B';
base64_count += 2;
break;
+ default:
+ (*o_mputc)(c);
+ base64_count++;
+ break;
}
}
+int mime_lastchar2, mime_lastchar1;
+/* Hook run on a (c2, c1) pair before it is encoded; the visible caller
+ * invokes it immediately before (*o_base64conv)(c2, c1).
+ *
+ * While MIME output is active (mimeout_mode non-zero) and c2 is
+ * non-zero, it checks base64_count plus mimeout_buf_count/3*4 against
+ * 66 and, when that is exceeded, sends EOF, NL and SPACE through
+ * o_base64conv -- presumably closing the current encoded word and
+ * folding the line; confirm against o_base64conv's target.
+ * In every case the pair is remembered in mime_lastchar2/mime_lastchar1.
+ * The commented-out branches are disabled alternatives kept by the
+ * author.
+ */
+void mime_prechar(c2, c1)
+ int c2, c1;
+{
+ if (mimeout_mode){
+ if (c2){
+ if (base64_count + mimeout_buf_count/3*4> 66){ /* buffered count scaled by 4/3, presumably its base64-expanded length */
+ (*o_base64conv)(EOF,0);
+ (*o_base64conv)(0,NL);
+ (*o_base64conv)(0,SPACE);
+ }
+ }/*else if (mime_lastchar2){
+ if (c1 <=DEL && !nkf_isspace(c1)){
+ (*o_base64conv)(0,SPACE);
+ }
+ }*/
+ }/*else{
+ if (c2 && mime_lastchar2 == 0
+ && mime_lastchar1 && !nkf_isspace(mime_lastchar1)){
+ (*o_base64conv)(0,SPACE);
+ }
+ }*/
+ mime_lastchar2 = c2; /* remember this pair for the next call */
+ mime_lastchar1 = c1;
+}
+
void
mime_putc(c)
int c;
{
int i = 0;
int j = 0;
-
- if (mimeout_f==FIXED_MIME && base64_count>50) {
- eof_mime();
- (*o_mputc)(NL);
- base64_count=0;
- } else if (c==CR||c==NL) {
- base64_count=0;
- }
- if (c!=EOF && mimeout_f!=FIXED_MIME) {
- if ( c<=DEL &&(output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
- if (mimeout_mode=='Q') {
- if (c<=SPACE) {
- close_mime();
- (*o_mputc)(SPACE);
- base64_count++;
- }
- (*o_mputc)(c);
- base64_count++;
- return;
- } else if (mimeout_mode) {
- if (base64_count>63) {
- eof_mime();
- (*o_mputc)(NL);
- (*o_mputc)(SPACE);
- base64_count=1;
- mimeout_preserve_space = TRUE;
- }
- if (c==SPACE || c==TAB || c==CR || c==NL) {
- for (i=0;i<mimeout_buf_count;i++) {
- if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
- eof_mime();
- for (i=0;i<mimeout_buf_count;i++) {
- (*o_mputc)(mimeout_buf[i]);
- base64_count++;
- }
- mimeout_buf_count = 0;
- }
- }
- mimeout_buf[mimeout_buf_count++] = c;
- if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
- eof_mime();
- base64_count = 0;
- for (i=0;i<mimeout_buf_count;i++) {
- (*o_mputc)(mimeout_buf[i]);
- base64_count++;
- }
- }
- return;
- }
- if (mimeout_buf_count>0 && SPACE<c) {
- mimeout_buf[mimeout_buf_count++] = c;
- if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
- } else {
- return;
- }
- }
- } else if (!mimeout_mode) {
- if (c==SPACE || c==TAB || c==CR || c==NL) {
- if ((c==CR || c==NL)
- &&(mimeout_buf[mimeout_buf_count-1]==SPACE
- || mimeout_buf[mimeout_buf_count-1]==TAB)) {
- mimeout_buf_count--;
- }
- for (i=0;i<mimeout_buf_count;i++) {
- (*o_mputc)(mimeout_buf[i]);
- base64_count++;
- }
- mimeout_buf_count = 0;
- }
- mimeout_buf[mimeout_buf_count++] = c;
- if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
- open_mime(output_mode);
- }
- return;
- }
- } else if (!mimeout_mode) {
- if (mimeout_buf_count>0 && mimeout_buf[mimeout_buf_count-1]==SPACE) {
- for (i=0;i<mimeout_buf_count-1;i++) {
- (*o_mputc)(mimeout_buf[i]);
- base64_count++;
- }
- mimeout_buf[0] = SPACE;
- mimeout_buf_count = 1;
- }
- open_mime(output_mode);
+ int lastchar;
+
+ if (mimeout_f == FIXED_MIME){
+ if (mimeout_mode == 'Q'){
+ if (base64_count > 71){
+ if (c!=CR && c!=NL) {
+ (*o_mputc)('=');
+ (*o_mputc)(NL);
+ }
+ base64_count = 0;
+ }
+ }else{
+ if (base64_count > 71){
+ eof_mime();
+ (*o_mputc)(NL);
+ base64_count = 0;
+ }
+ if (c == EOF) { /* c==EOF */
+ eof_mime();
+ }
+ }
+ if (c != EOF) { /* c!=EOF */
+ mimeout_addchar(c);
}
- } else if (c == EOF) { /* c==EOF */
+ return;
+ }
+
+ /* mimeout_f != FIXED_MIME */
+
+ if (c == EOF) { /* c==EOF */
j = mimeout_buf_count;
+ mimeout_buf_count = 0;
i = 0;
for (;i<j;i++) {
- if (mimeout_buf[i]==SPACE || mimeout_buf[i]==TAB
- || mimeout_buf[i]==CR || mimeout_buf[i]==NL)
+ /*if (nkf_isspace(mimeout_buf[i])){
break;
- (*mime_putc)(mimeout_buf[i]);
+ }*/
+ mimeout_addchar(mimeout_buf[i]);
}
eof_mime();
for (;i<j;i++) {
}
return;
}
-
+
+ if (mimeout_mode=='Q') {
+ if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
+ if (c <= SPACE) {
+ close_mime();
+ (*o_mputc)(SPACE);
+ base64_count++;
+ }
+ (*o_mputc)(c);
+ base64_count++;
+ }
+ return;
+ }
+
+ if (mimeout_buf_count > 0){
+ lastchar = mimeout_buf[mimeout_buf_count - 1];
+ }else{
+ lastchar = -1;
+ }
+
+ if (!mimeout_mode) {
+ if (c <= DEL && (output_mode==ASCII ||output_mode == ISO8859_1)) {
+ if (nkf_isspace(c)) {
+ if (c==CR || c==NL) {
+ base64_count=0;
+ }
+ for (i=0;i<mimeout_buf_count;i++) {
+ (*o_mputc)(mimeout_buf[i]);
+ if (mimeout_buf[i] == CR || mimeout_buf[i] == NL){
+ base64_count = 0;
+ }else{
+ base64_count++;
+ }
+ }
+ mimeout_buf[0] = c;
+ mimeout_buf_count = 1;
+ }else{
+ if (base64_count > 1
+ && base64_count + mimeout_buf_count > 76){
+ (*o_mputc)(NL);
+ base64_count = 0;
+ if (!nkf_isspace(mimeout_buf[0])){
+ (*o_mputc)(SPACE);
+ base64_count++;
+ }
+ }
+ mimeout_buf[mimeout_buf_count++] = c;
+ if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
+ open_mime(output_mode);
+ }
+ }
+ return;
+ }else{
+ if (lastchar==CR || lastchar == NL){
+ for (i=0;i<mimeout_buf_count;i++) {
+ (*o_mputc)(mimeout_buf[i]);
+ }
+ base64_count = 0;
+ mimeout_buf_count = 0;
+ }
+ if (lastchar==SPACE) {
+ for (i=0;i<mimeout_buf_count-1;i++) {
+ (*o_mputc)(mimeout_buf[i]);
+ base64_count++;
+ }
+ mimeout_buf[0] = SPACE;
+ mimeout_buf_count = 1;
+ }
+ open_mime(output_mode);
+ }
+ }else{
+ /* mimeout_mode == 'B', 1, 2 */
+ if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO8859_1 ) ) {
+ if (lastchar == CR || lastchar == NL){
+ if (nkf_isblank(c)) {
+ for (i=0;i<mimeout_buf_count;i++) {
+ mimeout_addchar(mimeout_buf[i]);
+ }
+ mimeout_buf_count = 0;
+ } else if (SPACE<c && c<DEL) {
+ eof_mime();
+ for (i=0;i<mimeout_buf_count;i++) {
+ (*o_mputc)(mimeout_buf[i]);
+ }
+ base64_count = 0;
+ mimeout_buf_count = 0;
+ }
+ }
+ if (c==SPACE || c==TAB || c==CR || c==NL) {
+ for (i=0;i<mimeout_buf_count;i++) {
+ if (SPACE<mimeout_buf[i] && mimeout_buf[i]<DEL) {
+ eof_mime();
+ for (i=0;i<mimeout_buf_count;i++) {
+ (*o_mputc)(mimeout_buf[i]);
+ base64_count++;
+ }
+ mimeout_buf_count = 0;
+ }
+ }
+ mimeout_buf[mimeout_buf_count++] = c;
+ if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
+ eof_mime();
+ for (i=0;i<mimeout_buf_count;i++) {
+ (*o_mputc)(mimeout_buf[i]);
+ base64_count++;
+ }
+ mimeout_buf_count = 0;
+ }
+ return;
+ }
+ if (mimeout_buf_count>0 && SPACE<c && c!='=') {
+ mimeout_buf[mimeout_buf_count++] = c;
+ if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
+ j = mimeout_buf_count;
+ mimeout_buf_count = 0;
+ for (i=0;i<j;i++) {
+ mimeout_addchar(mimeout_buf[i]);
+ }
+ }
+ return;
+ }
+ }
+ }
if (mimeout_buf_count>0) {
j = mimeout_buf_count;
mimeout_buf_count = 0;
for (i=0;i<j;i++) {
+ if (mimeout_buf[i]==CR || mimeout_buf[i]==NL)
+ break;
mimeout_addchar(mimeout_buf[i]);
}
+ if (i<j) {
+ eof_mime();
+ base64_count=0;
+ for (;i<j;i++) {
+ (*o_mputc)(mimeout_buf[i]);
+ }
+ open_mime(output_mode);
+ }
}
mimeout_addchar(c);
}
-#ifdef PERL_XS
+#if defined(PERL_XS) || defined(WIN32DLL)
void
reinit()
{
input_f = FALSE;
alpha_f = FALSE;
mime_f = STRICT_MIME;
+ mime_decode_f = FALSE;
mimebuf_f = FALSE;
broken_f = FALSE;
iso8859_f = FALSE;
x0201_f = NO_X0201;
#endif
iso2022jp_f = FALSE;
+#ifdef UNICODE_ENABLE
+ internal_unicode_f = TRUE;
+#endif
#ifdef UTF8_OUTPUT_ENABLE
unicode_bom_f = 0;
w_oconv16_LE = 0;
ms_ucs_map_f = FALSE;
#endif
+#ifdef UNICODE_NORMALIZATION
+ nfc_f = FALSE;
+#endif
#ifdef INPUT_OPTION
cap_f = FALSE;
url_f = FALSE;
}
}
#ifdef UTF8_INPUT_ENABLE
- utf16_mode = UTF16LE_INPUT;
+ utf16_mode = UTF16BE_INPUT;
#endif
mimeout_buf_count = 0;
mimeout_mode = 0;
broken_counter = 0;
broken_last = 0;
z_prev2=0,z_prev1=0;
-
+#ifdef CHECK_OPTION
+ iconv_for_check = 0;
+#endif
+ input_codename = "";
+#ifdef WIN32DLL
+ reinitdll();
+#endif /*WIN32DLL*/
}
#endif
{
fprintf(stderr,"nkf internal module connection failure.\n");
exit(1);
+ return 0; /* LINT */
}
#ifndef PERL_XS
+#ifdef WIN32DLL
+#define fprintf dllprintf
+#endif
void
usage()
{
fprintf(stderr,"Flags:\n");
fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
#ifdef DEFAULT_CODE_SJIS
- fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC), UTF-8\n");
+ fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC), UTF-8N\n");
#endif
#ifdef DEFAULT_CODE_JIS
- fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC), UTF-8\n");
+ fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC), UTF-8N\n");
#endif
#ifdef DEFAULT_CODE_EUC
- fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT), UTF-8\n");
+ fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT), UTF-8N\n");
#endif
#ifdef DEFAULT_CODE_UTF8
- fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC), UTF-8 (DEFAULT)\n");
+ fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC), UTF-8N (DEFAULT)\n");
#endif
#ifdef UTF8_OUTPUT_ENABLE
fprintf(stderr," After 'w' you can add more options. (80?|16((B|L)0?)?) \n");
fprintf(stderr,"t no conversion\n");
fprintf(stderr,"i_/o_ Output sequence to designate JIS-kanji/ASCII (DEFAULT B)\n");
fprintf(stderr,"r {de/en}crypt ROT13/47\n");
- fprintf(stderr,"h 1 hirakana->katakana, 2 katakana->hirakana,3 both\n");
+ fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
fprintf(stderr,"v Show this usage. V: show version\n");
fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
fprintf(stderr," --fj,--unix,--mac,--windows convert for the system\n");
fprintf(stderr," --jis,--euc,--sjis,--utf8,--utf16,--mime,--base64 convert for the code\n");
fprintf(stderr," --hiragana, --katakana Hiragana/Katakana Conversion\n");
- fprintf(stderr," --cp932, --no-cp932 CP932 compatible\n");
+ fprintf(stderr," --x0212 Convert JISX0212\n");
+ fprintf(stderr," --cp932, --no-cp932 CP932 compatibility\n");
+ fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
#ifdef INPUT_OPTION
fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
#endif
#ifdef NUMCHAR_OPTION
fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
#endif
+#ifdef UNICODE_NORMALIZATION
+ fprintf(stderr," --utf8mac-input UTF-8-MAC input\n");
+#endif
#ifdef UTF8_OUTPUT_ENABLE
fprintf(stderr," --ms-ucs-map Microsoft UCS Mapping Compatible\n");
#endif
,NKF_VERSION,NKF_RELEASE_DATE);
fprintf(stderr,"\n%s\n",CopyRight);
}
-#endif
+#endif /*PERL_XS*/
/**
** \e$B%Q%C%A@):n<T\e(B