** E-Mail: furukawa@tcp-ip.or.jp
** \e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#\e(B
***********************************************************************/
-/* $Id: nkf.c,v 1.67 2005/04/09 21:06:20 rei_furukawa Exp $ */
+/* $Id: nkf.c,v 1.72 2005/07/10 04:36:50 naruse Exp $ */
#define NKF_VERSION "2.0.5"
-#define NKF_RELEASE_DATE "2005-04-10"
+#define NKF_RELEASE_DATE "2005-07-10"
#include "config.h"
static char *CopyRight =
#ifdef PERL_XS
#undef OVERWRITE
#endif
+#if defined( UTF8_OUTPUT_ENABLE ) || defined( UTF8_INPUT_ENABLE )
+#define UNICODE_ENABLE
+#else
+#undef UNICODE_NORMALIZATION
+#endif
#ifndef PERL_XS
#include <stdio.h>
#define UTF8 12
#define UTF8_INPUT 13
-#define UTF16LE_INPUT 14
-#define UTF16BE_INPUT 15
+#define UTF16BE_INPUT 14
+#define UTF16LE_INPUT 15
#define WISH_TRUE 15
#define GETA2 0x2e
-#if defined( UTF8_OUTPUT_ENABLE ) || defined( UTF8_INPUT_ENABLE )
+#ifdef UNICODE_ENABLE
#define sizeof_euc_utf8 94
#define sizeof_euc_to_utf8_1byte 94
#define sizeof_euc_to_utf8_2bytes 94
static unsigned int mime_top = 0;
static unsigned int mime_last = 0; /* decoded */
static unsigned int mime_input = 0; /* undecoded */
+static int (*mime_iconv_back)PROTO((int c2,int c1,int c0)) = NULL;
/* flags */
static int unbuf_f = FALSE;
static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
#endif
static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
+#ifdef UNICODE_ENABLE
+static int internal_unicode_f = FALSE; /* Internal Unicode Processing */
+#endif
#ifdef UTF8_OUTPUT_ENABLE
static int unicode_bom_f= 0; /* Output Unicode BOM */
static int w_oconv16_LE = 0; /* utf-16 little endian */
static int ms_ucs_map_f = FALSE; /* Microsoft UCS Mapping Compatible */
#endif
-
-#ifdef NUMCHAR_OPTION
-
-#define CLASS_MASK 0x0f000000
-#define CLASS_UTF16 0x01000000
+#ifdef UNICODE_NORMALIZATION
+static int nfc_f = FALSE;
+static int (*i_nfc_getc)PROTO((FILE *)) = std_getc; /* input of ugetc */
+static int (*i_nfc_ungetc)PROTO((int c ,FILE *f)) = std_ungetc;
+STATIC int nfc_getc PROTO((FILE *f));
+STATIC int nfc_ungetc PROTO((int c,FILE *f));
#endif
#ifdef INPUT_OPTION
static int (*i_uungetc)PROTO((int c ,FILE *f)) = std_ungetc;
STATIC int url_getc PROTO((FILE *f));
STATIC int url_ungetc PROTO((int c,FILE *f));
+#endif
+#ifdef NUMCHAR_OPTION
+#define CLASS_MASK 0x0f000000
+#define CLASS_UTF16 0x01000000
static int numchar_f = FALSE;
static int (*i_ngetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
static int (*i_nungetc)PROTO((int c ,FILE *f)) = std_ungetc;
STATIC void no_putc PROTO((int c));
static int debug_f = FALSE;
STATIC void debug PROTO((char *str));
+static int (*iconv_for_check)() = 0;
#endif
static int guess_f = FALSE;
#ifdef UTF8_INPUT_ENABLE
STATIC void w_status PROTO((struct input_code *, int));
STATIC void w16_status PROTO((struct input_code *, int));
-static int utf16_mode = UTF16LE_INPUT;
+static int utf16_mode = UTF16BE_INPUT;
#endif
struct input_code input_code_list[] = {
is_inputcode_mixed = FALSE;
is_inputcode_set = FALSE;
input_codename = "";
+#ifdef CHECK_OPTION
+ iconv_for_check = 0;
+#endif
if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
perror(*--argv);
return(-1);
#ifdef X0212_ENABLE
{"x0212", ""},
#endif
+#ifdef UNICODE_ENABLE
+ {"internal-unicode", ""},
+#endif
#ifdef UTF8_OUTPUT_ENABLE
{"utf8", "w"},
{"utf16", "w16"},
{"utf8-input", "W"},
{"utf16-input", "W16"},
#endif
+#ifdef UNICODE_NORMALIZATION
+ {"utf8mac-input", ""},
+#endif
#ifdef OVERWRITE
{"overwrite", ""},
#endif
if (option_mode==1)
return;
- if (*cp++ != '-')
- return;
+ while(*cp && *cp++!='-');
while (*cp) {
- if (p && !*cp) {
- cp = p;
- p = 0;
- }
+ p = 0;
switch (*cp++) {
case '-': /* literal options */
if (!*cp) { /* ignore the rest of arguments */
for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
int j;
p = (unsigned char *)long_option[i].name;
- for (j=0;*p && (*p != '=') && *p == cp[j];p++, j++);
- if (*p == cp[j]){
+ for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
+ if (*p == cp[j] || cp[j] == ' '){
p = &cp[j];
break;
}
if (p == 0) return;
cp = (unsigned char *)long_option[i].alias;
if (!*cp){
+ cp = p;
#ifdef OVERWRITE
if (strcmp(long_option[i].name, "overwrite") == 0){
file_out = TRUE;
return;
}
#endif
+#ifdef UNICODE_ENABLE
+ if (strcmp(long_option[i].name, "internal-unicode") == 0){
+ internal_unicode_f = TRUE;
+ continue;
+ }
+#endif
#ifdef UTF8_OUTPUT_ENABLE
if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
ms_ucs_map_f = TRUE;
continue;
}
#endif
+#ifdef UNICODE_NORMALIZATION
+ if (strcmp(long_option[i].name, "utf8mac-input") == 0){
+ input_f = UTF8_INPUT;
+ nfc_f = TRUE;
+ continue;
+ }
+#endif
if (strcmp(long_option[i].name, "prefix=") == 0){
if (*p == '=' && ' ' < p[1] && p[1] < 128){
for (i = 2; ' ' < p[i] && p[i] < 128; i++){
continue;
case 'h':
/*
- bit:1 hira -> kata
- bit:2 kata -> hira
+ bit:1 katakana->hiragana
+ bit:2 hiragana->katakana
*/
if ('9'>= *cp && *cp>='0')
hira_f |= (*cp++ -'0');
#ifdef UTF8_INPUT_ENABLE
case 'W': /* UTF-8 input */
if ('1'== cp[0] && '6'==cp[1]) {
- input_f = UTF16LE_INPUT;
+ input_f = UTF16BE_INPUT;
+ utf16_mode = UTF16BE_INPUT;
+ cp += 2;
if (cp[0]=='L') {
cp++;
+ input_f = UTF16LE_INPUT;
+ utf16_mode = UTF16LE_INPUT;
} else if (cp[0] == 'B') {
cp++;
input_f = UTF16BE_INPUT;
+ utf16_mode = UTF16BE_INPUT;
}
} else if (cp[0] == '8') {
cp++;
continue;
case ' ':
/* multiple options in a single string are allowed for the Perl module */
- while(*cp && *cp!='-') cp++;
- if(*cp=='-') cp++;
+ while(*cp && *cp++!='-');
continue;
default:
/* bogus option but ignored */
return 0;
}
-#ifdef CHECK_OPTION
-static int (*iconv_for_check)() = 0;
-#endif
-
#ifdef ANSI_C_PROTOTYPE
void set_iconv(int f, int (*iconv_func)(int c2,int c1,int c0))
#else
i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
}
#endif
+#ifdef UNICODE_NORMALIZATION
+ if (nfc_f && input_f == UTF8_INPUT){
+ i_nfc_getc = i_getc; i_getc = nfc_getc;
+ i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
+ }
+#endif
if (mime_f && mimebuf_f==FIXED_MIME) {
i_mgetc = i_getc; i_getc = mime_getc;
i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
#ifdef UTF8_INPUT_ENABLE
} else if (input_f == UTF8_INPUT) {
set_iconv(-TRUE, w_iconv);
+ } else if (input_f == UTF16BE_INPUT) {
+ set_iconv(-TRUE, w_iconv16);
} else if (input_f == UTF16LE_INPUT) {
set_iconv(-TRUE, w_iconv16);
#endif
int c2,
c1, c0;
{
- int ret = w2e_conv(c2, c1, c0, &c2, &c1);
+ int ret = 0;
+ unsigned short val = 0;
+
+ if (c0 == 0){
+ if (c2 < 0x80 || (c2 & 0xc0) == 0xdf) /* 0x00-0x7f 0xc0-0xdf */
+ ; /* 1 or 2 bytes */
+ else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
+ return -1; /* 3bytes */
+#ifdef __COMMENT__
+ else if (0xf0 <= c2)
+ return 0; /* 4,5,6bytes */
+ else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
+ return 0; /* trail byte */
+#endif
+ else return 0;
+ }
+ if (c2 == EOF);
+ else if (c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
+ return 0; /* throw BOM */
+ else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
+ val = ww16_conv(c2, c1, c0);
+ c2 = (val >> 8) & 0xff;
+ c1 = val & 0xff;
+ } else {
+ ret = w2e_conv(c2, c1, c0, &c2, &c1);
+ }
if (ret == 0){
(*oconv)(c2, c1);
}
w_iconv16(c2, c1, c0)
int c2, c1,c0;
{
- int ret;
+ int ret = 0;
if (c2==0376 && c1==0377){
- utf16_mode = UTF16LE_INPUT;
+ utf16_mode = UTF16BE_INPUT;
return 0;
} else if (c2==0377 && c1==0376){
- utf16_mode = UTF16BE_INPUT;
+ utf16_mode = UTF16LE_INPUT;
return 0;
}
- if (c2 != EOF && utf16_mode == UTF16BE_INPUT) {
+ if (c2 != EOF && utf16_mode == UTF16LE_INPUT) {
int tmp;
tmp=c1; c1=c2; c2=tmp;
}
(*oconv)(c2, c1);
return 0;
}
- ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
+ if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16));
+ else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
if (ret) return ret;
(*oconv)(c2, c1);
return 0;
c1;
{
int c0;
+ unsigned short val;
if (c2 == EOF) {
(*o_putc)(EOF);
return;
output_mode = ISO8859_1;
(*o_putc)(c1 | 0x080);
} else {
- unsigned short val;
output_mode = UTF8;
- val = e2w_conv(c2, c1);
+ if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16))
+ val = ((c2<<8)&0xff00) + c1;
+ else val = e2w_conv(c2, c1);
if (val){
w16w_conv(val, &c2, &c1, &c0);
(*o_putc)(c2);
unicode_bom_f=1;
}
- if (c2 == ISO8859_1) {
+ if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16)){
+ } else if (c2 == ISO8859_1) {
c2 = 0;
c1 |= 0x80;
#ifdef NUMCHAR_OPTION
}
i_getc = i_mgetc;
i_ungetc = i_mungetc;
+ if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
+ mime_iconv_back = NULL;
}
int
}
mime_decode_mode = p[i-2];
+ mime_iconv_back = iconv;
set_iconv(FALSE, mime_priority_func[j]);
clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
}
#endif
+#ifdef UNICODE_NORMALIZATION
+
+/*
+ * Normalization Form C input filter.
+ *
+ * Reads bytes through the i_nfc_getc hook and binary-searches
+ * normalization_table for an entry whose nfd sequence equals the bytes at
+ * the head of the input.  When one is found, those bytes are replaced by
+ * the entry's nfc sequence and the lookup is repeated on the result.
+ * The first byte is returned; every other byte that was fetched or
+ * produced is handed back through the i_nfc_ungetc hook so that the
+ * following calls deliver it.
+ *
+ * f: stream, passed unchanged to the getc/ungetc hooks.
+ * Returns the first byte of the (possibly composed) input, or whatever
+ * the getc hook returned when no table entry applies.
+ *
+ * NOTE(review): the search only works if normalization_table is ordered
+ * by its nfd sequences, and buf[] must hold at least
+ * NORMALIZATION_TABLE_NFD_LENGTH elements -- confirm against the table
+ * definition.
+ */
+int
+nfc_getc(f)
+    FILE *f;
+{
+    int (*g)() = i_nfc_getc;
+    int (*u)() = i_nfc_ungetc;
+    int i=0, j, k=1, lower, upper;
+    int buf[9];
+    int *array = NULL;
+    extern struct normalization_pair normalization_table[];
+
+    buf[i] = (*g)(f);
+    /* k > 0 means the previous pass composed something, so look again;
+       a byte of the form 10xxxxxx is never looked up. */
+    while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
+        lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
+        while (upper >= lower) {
+            j = (lower+upper) / 2;
+            array = normalization_table[j].nfd;
+            for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
+                /* Fetch a byte only when it is about to be compared, so
+                   nothing beyond the candidate sequence is consumed. */
+                if (k > i)
+                    buf[++i] = (*g)(f);
+                if (array[k] != buf[k]){
+                    if (array[k] < buf[k])
+                        lower = j + 1;
+                    else
+                        upper = j - 1;
+                    k = 0;	/* mark "no match" for this candidate */
+                    break;
+                }
+            }
+            if (k > 0){
+                /* Bytes fetched while probing longer candidates are not
+                   part of this match: push them back first so they are
+                   read again after the composed sequence. */
+                while (i >= k)
+                    (*u)(buf[i--], f);
+                array = normalization_table[j].nfc;
+                for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
+                    buf[i] = array[i];
+                i--;
+                break;
+            }
+        }
+        /* Keep only buf[0]; return the rest to the input, last byte first. */
+        while (i > 0)
+            (*u)(buf[i--], f);
+    }
+    return buf[0];
+}
+
+/* Push-back half of the NFC input filter: hands c and f unchanged to the
+   ungetc hook saved in i_nfc_ungetc and returns that hook's result. */
+int
+nfc_ungetc(c, f)
+    int c;
+    FILE *f;
+{
+    int (*push_back)() = i_nfc_ungetc;
+
+    return (*push_back)(c, f);
+}
+#endif /* UNICODE_NORMALIZATION */
+
int
mime_getc(f)
if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
restart_mime_q:
if (c1=='_') return ' ';
+ if (c1<=' ' || DEL<=c1) {
+ mime_decode_mode = FALSE; /* quit */
+ unswitch_mime_getc();
+ return c1;
+ }
if (c1!='=' && c1!='?') {
return c1;
}
mime_decode_mode = exit_mode; /* prepare for quit */
- if (c1<=' ') return c1;
if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
/* end Q encoding */
x0201_f = NO_X0201;
#endif
iso2022jp_f = FALSE;
+#ifdef UNICODE_ENABLE
+ internal_unicode_f = TRUE;
+#endif
#ifdef UTF8_OUTPUT_ENABLE
unicode_bom_f = 0;
w_oconv16_LE = 0;
ms_ucs_map_f = FALSE;
#endif
+#ifdef UNICODE_NORMALIZATION
+ nfc_f = FALSE;
+#endif
#ifdef INPUT_OPTION
cap_f = FALSE;
url_f = FALSE;
}
}
#ifdef UTF8_INPUT_ENABLE
- utf16_mode = UTF16LE_INPUT;
+ utf16_mode = UTF16BE_INPUT;
#endif
mimeout_buf_count = 0;
mimeout_mode = 0;
fprintf(stderr,"Flags:\n");
fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
#ifdef DEFAULT_CODE_SJIS
- fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC), UTF-8\n");
+ fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC), UTF-8N\n");
#endif
#ifdef DEFAULT_CODE_JIS
- fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC), UTF-8\n");
+ fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC), UTF-8N\n");
#endif
#ifdef DEFAULT_CODE_EUC
- fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT), UTF-8\n");
+ fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT), UTF-8N\n");
#endif
#ifdef DEFAULT_CODE_UTF8
- fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC), UTF-8 (DEFAULT)\n");
+ fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC), UTF-8N (DEFAULT)\n");
#endif
#ifdef UTF8_OUTPUT_ENABLE
fprintf(stderr," After 'w' you can add more options. (80?|16((B|L)0?)?) \n");
fprintf(stderr,"t no conversion\n");
fprintf(stderr,"i_/o_ Output sequence to designate JIS-kanji/ASCII (DEFAULT B)\n");
fprintf(stderr,"r {de/en}crypt ROT13/47\n");
- fprintf(stderr,"h 1 hirakana->katakana, 2 katakana->hirakana,3 both\n");
+ fprintf(stderr,"h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n");
fprintf(stderr,"v Show this usage. V: show version\n");
fprintf(stderr,"m[BQN0] MIME decode [B:base64,Q:quoted,N:non-strict,0:no decode]\n");
fprintf(stderr,"M[BQ] MIME encode [B:base64 Q:quoted]\n");
fprintf(stderr," --hiragana, --katakana Hiragana/Katakana Conversion\n");
fprintf(stderr," --x0212 Convert JISX0212\n");
fprintf(stderr," --cp932, --no-cp932 CP932 compatibility\n");
+ fprintf(stderr," --prefix= Insert escape before troublesome characters of Shift_JIS\n");
#ifdef INPUT_OPTION
fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%%'\n");
#endif
#ifdef NUMCHAR_OPTION
fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
#endif
+#ifdef UNICODE_NORMALIZATION
+ fprintf(stderr," --utf8mac-input UTF-8-MAC input\n");
+#endif
#ifdef UTF8_OUTPUT_ENABLE
fprintf(stderr," --ms-ucs-map Microsoft UCS Mapping Compatible\n");
#endif