** E-Mail: furukawa@tcp-ip.or.jp
** \e$B$^$G8fO"Mm$r$*4j$$$7$^$9!#\e(B
***********************************************************************/
+/* $Id: nkf.c,v 1.37 2004/11/08 14:15:14 naruse Exp $ */
+#define NKF_VERSION "2.0.4"
+#define NKF_RELEASE_DATE "2004-11-08"
#include "config.h"
static char *CopyRight =
"Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW, 2002-2004 Kono, Furukawa";
-static char *Version =
- "2.0";
-static char *Patchlevel =
- "4/0401/Shinji Kono";
+
/*
**
#define UTF8 12
#define UTF8_INPUT 13
-#define UTF16_INPUT 14
+#define UTF16LE_INPUT 14
#define UTF16BE_INPUT 15
#define WISH_TRUE 15
#endif
static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
#ifdef UTF8_OUTPUT_ENABLE
-static int w_oconv16_begin_f= 0; /* utf-16 header */
+static int unicode_bom_f= 0; /* Output Unicode BOM */
static int w_oconv16_LE = 0; /* utf-16 little endian */
+static int ms_ucs_map_f = FALSE; /* Microsoft UCS Mapping Compatible */
#endif
#ifdef UTF8_INPUT_ENABLE
STATIC void w_status PROTO((struct input_code *, int));
STATIC void w16_status PROTO((struct input_code *, int));
-static int utf16_mode = UTF16_INPUT;
+static int utf16_mode = UTF16LE_INPUT;
#endif
struct input_code input_code_list[] = {
static int fold_len = 0;
/* options */
-static unsigned char kanji_intro = DEFAULT_J,
- ascii_intro = DEFAULT_R;
+static unsigned char kanji_intro = DEFAULT_J;
+static unsigned char ascii_intro = DEFAULT_R;
/* Folding */
#ifdef UTF8_OUTPUT_ENABLE
{"utf8", "w"},
{"utf16", "w16"},
+ {"ms-ucs-map", ""},
#endif
#ifdef UTF8_INPUT_ENABLE
{"utf8-input", "W"},
{"prefix=", ""},
};
-static int option_mode;
+static int option_mode = 0;
void
options(cp)
return;
}
#endif
+#ifdef UTF8_OUTPUT_ENABLE
+ if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
+ ms_ucs_map_f = TRUE;
+ continue;
+ }
+#endif
if (strcmp(long_option[i].name, "prefix=") == 0){
if (*p == '=' && ' ' < p[1] && p[1] < 128){
for (i = 2; ' ' < p[i] && p[i] < 128; i++){
if ('1'== cp[0] && '6'==cp[1]) {
output_conv = w_oconv16; cp+=2;
if (cp[0]=='L') {
- w_oconv16_begin_f=2; cp++;
+ unicode_bom_f=2; cp++;
w_oconv16_LE = 1;
if (cp[0] == '0'){
- w_oconv16_begin_f=1; cp++;
+ unicode_bom_f=1; cp++;
}
} else if (cp[0] == 'B') {
- w_oconv16_begin_f=2; cp++;
+ unicode_bom_f=2; cp++;
if (cp[0] == '0'){
- w_oconv16_begin_f=1; cp++;
+ unicode_bom_f=1; cp++;
}
- }
+ }
+ } else if (cp[0] == '8') {
+ output_conv = w_oconv; cp++;
+ unicode_bom_f=2;
+ if (cp[0] == '0'){
+ unicode_bom_f=1; cp++;
+ }
} else
output_conv = w_oconv;
continue;
#ifdef UTF8_INPUT_ENABLE
case 'W': /* UTF-8 input */
if ('1'== cp[0] && '6'==cp[1]) {
- input_f = UTF16_INPUT;
+ /* Consume the "16" first, as the 'w' output case does, so the optional endian suffix is read from the right position. */
+ cp += 2;
+ input_f = UTF16LE_INPUT; /* little endian unless a 'B' suffix follows */
+ if (cp[0]=='L') {
+ cp++;
+ } else if (cp[0] == 'B') {
+ cp++;
+ input_f = UTF16BE_INPUT;
+ }
+ } else if (cp[0] == '8') {
+ cp++; /* "W8": explicit spelling of UTF-8 input */
+ input_f = UTF8_INPUT;
} else
input_f = UTF8_INPUT;
continue;
#ifdef UTF8_INPUT_ENABLE
} else if (input_f == UTF8_INPUT) {
set_iconv(-TRUE, w_iconv);
- } else if (input_f == UTF16_INPUT) {
+ } else if (input_f == UTF16LE_INPUT) {
set_iconv(-TRUE, w_iconv16);
+ } else if (input_f == UTF16BE_INPUT) {
+ /* Explicit big-endian request: without this branch UTF16BE_INPUT falls through to the final else. */
+ set_iconv(-TRUE, w_iconv16);
+ utf16_mode = UTF16BE_INPUT; /* NOTE(review): assumes w_iconv16 takes its byte order from utf16_mode - confirm */
#endif
} else {
int ret;
if (c2==0376 && c1==0377){
- utf16_mode = UTF16_INPUT;
+ utf16_mode = UTF16LE_INPUT;
return 0;
} else if (c2==0377 && c1==0376){
utf16_mode = UTF16BE_INPUT;
{
extern unsigned short euc_to_utf8_1byte[];
extern unsigned short * euc_to_utf8_2bytes[];
+ extern unsigned short * euc_to_utf8_2bytes_ms[];
unsigned short *p;
if (c2 == X0201) {
c2 &= 0x7f;
c2 = (c2&0x7f) - 0x21;
if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
- p = euc_to_utf8_2bytes[c2];
+ p = ms_ucs_map_f ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
else
return 0;
}
if (c2 == EOF) {
(*o_putc)(EOF);
return;
- } else if (c2 == 0) {
+ }
+
+ if (unicode_bom_f==2) {
+ (*o_putc)('\357');
+ (*o_putc)('\273');
+ (*o_putc)('\277');
+ unicode_bom_f=1;
+ }
+
+ if (c2 == 0) {
output_mode = ASCII;
(*o_putc)(c1);
} else if (c2 == ISO8859_1) {
return;
}
- if (w_oconv16_begin_f==2) {
+ if (unicode_bom_f==2) {
if (w_oconv16_LE){
(*o_putc)((unsigned char)'\377');
(*o_putc)('\376');
(*o_putc)('\376');
(*o_putc)((unsigned char)'\377');
}
- w_oconv16_begin_f=1;
+ unicode_bom_f=1;
}
if (c2 == ISO8859_1) {
void
reinit()
{
+ { /* reinit(): put option flags, I/O hooks and converter state back to fixed values */
+ struct input_code *p = input_code_list;
+ while (p->name){
+ status_reinit(p++); /* every input_code_list entry, up to the null-name terminator */
+ }
+ }
unbuf_f = FALSE;
estab_f = FALSE;
nop_f = FALSE;
- binmode_f = TRUE;
- rot_f = FALSE;
- hira_f = FALSE;
- input_f = FALSE;
- alpha_f = FALSE;
- mime_f = STRICT_MIME;
- mimebuf_f = FALSE;
- broken_f = FALSE;
- iso8859_f = FALSE;
-#if defined(MSDOS) || defined(__OS2__)
- x0201_f = TRUE;
+ binmode_f = TRUE;
+ rot_f = FALSE;
+ hira_f = FALSE;
+ input_f = FALSE;
+ alpha_f = FALSE;
+ mime_f = STRICT_MIME;
+ mimebuf_f = FALSE;
+ broken_f = FALSE;
+ iso8859_f = FALSE;
+ mimeout_f = FALSE;
+#if defined(MSDOS) || defined(__OS2__)
+ x0201_f = TRUE;
#else
x0201_f = NO_X0201;
#endif
iso2022jp_f = FALSE;
-
+#ifdef UTF8_OUTPUT_ENABLE
+ unicode_bom_f = 0; /* cleared outright; the old body re-armed a non-zero flag to 2 instead */
+ w_oconv16_LE = 0;
+ ms_ucs_map_f = FALSE;
+#endif
+#ifdef INPUT_OPTION
+ cap_f = FALSE;
+ url_f = FALSE;
+ numchar_f = FALSE; /* NOTE(review): usage() guards --numchar-input with NUMCHAR_OPTION, not INPUT_OPTION - confirm which macro declares numchar_f */
+#endif
+#ifdef CHECK_OPTION
+ noout_f = FALSE;
+ debug_f = FALSE;
+#endif
+ guess_f = FALSE;
+ is_inputcode_mixed = FALSE;
+ is_inputcode_set = FALSE; /* NOTE(review): the old body also reset input_codename to ""; no replacement is added here - confirm it is reset elsewhere */
+#ifdef EXEC_IO
+ exec_f = 0;
+#endif
+#ifdef SHIFTJIS_CP932
+ cp932_f = TRUE;
+ cp932inv_f = FALSE;
+#endif
+ {
+ int i;
+ for (i = 0; i < 256; i++){
+ prefix_table[i] = 0; /* clear all 256 prefix_table entries */
+ }
+ }
+#ifdef UTF8_INPUT_ENABLE
+ utf16_mode = UTF16LE_INPUT; /* same value as the static initializer */
+#endif
+ mimeout_mode = 0;
+ base64_count = 0;
+ f_line = 0;
+ f_prev = 0;
+ fold_preserve_f = FALSE;
+ fold_f = FALSE;
+ fold_len = 0;
kanji_intro = DEFAULT_J;
ascii_intro = DEFAULT_R;
-
- output_conv = DEFAULT_CONV;
- oconv = DEFAULT_CONV;
-
- i_mgetc = std_getc;
- i_mungetc = std_ungetc;
- i_mgetc_buf = std_getc;
- i_mungetc_buf = std_ungetc;
-
- i_getc= std_getc;
- i_ungetc=std_ungetc;
-
- i_bgetc= std_getc;
- i_bungetc= std_ungetc;
-
- o_putc = std_putc;
- o_mputc = std_putc;
- o_crconv = no_connection;
- o_rot_conv = no_connection;
- o_iso2022jp_check_conv = no_connection;
- o_hira_conv = no_connection;
- o_fconv = no_connection;
+ fold_margin = FOLD_MARGIN;
+ output_conv = DEFAULT_CONV;
+ oconv = DEFAULT_CONV;
o_zconv = no_connection;
-
+ o_fconv = no_connection; /* this and the following o_*conv hooks are all set to no_connection */
+ o_crconv = no_connection;
+ o_rot_conv = no_connection;
+ o_hira_conv = no_connection;
+ o_base64conv = no_connection;
+ o_iso2022jp_check_conv = no_connection;
+ o_putc = std_putc; /* byte-level input/output hooks go back to the std_* functions */
i_getc = std_getc;
i_ungetc = std_ungetc;
- i_mgetc = std_getc;
- i_mungetc = std_ungetc;
-
+ i_bgetc = std_getc;
+ i_bungetc = std_ungetc;
+ o_mputc = std_putc;
+ i_mgetc = std_getc;
+ i_mungetc = std_ungetc;
+ i_mgetc_buf = std_getc;
+ i_mungetc_buf = std_ungetc;
output_mode = ASCII;
input_mode = ASCII;
shift_mode = FALSE;
- mime_decode_mode = FALSE;
+ mime_decode_mode = FALSE;
file_out = FALSE;
- mimeout_mode = 0;
- mimeout_f = FALSE;
- base64_count = 0;
- option_mode = 0;
crmode_f = 0;
-
- {
- struct input_code *p = input_code_list;
- while (p->name){
- status_reinit(p++);
- }
- }
-#ifdef UTF8_OUTPUT_ENABLE
- if (w_oconv16_begin_f) {
- w_oconv16_begin_f = 2;
- }
-#endif
- f_line = 0;
- f_prev = 0;
- fold_preserve_f = FALSE;
- fold_f = FALSE;
- fold_len = 0;
- fold_margin = FOLD_MARGIN;
+ option_mode = 0; /* same value as the static initializer */
broken_counter = 0;
broken_last = 0;
z_prev2=0,z_prev1=0;
- {
- int i;
- for (i = 0; i < 256; i++){
- prefix_table[i] = 0;
- }
- }
- input_codename = "";
- is_inputcode_mixed = FALSE;
- is_inputcode_set = FALSE;
}
#endif
#ifdef DEFAULT_CODE_UTF8
fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC), UTF-8 (DEFAULT)\n");
#endif
+#ifdef UTF8_OUTPUT_ENABLE
+ fprintf(stderr," After 'w' you can add more options. (80?|16((B|L)0?)?) \n");
+#endif
fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, AT&T JIS (EUC), UTF-8\n");
+#ifdef UTF8_INPUT_ENABLE
+ fprintf(stderr," After 'W' you can add more options. (8|16(B|L)?) \n");
+#endif
fprintf(stderr,"t no conversion\n");
fprintf(stderr,"i_/o_ Output sequence to designate JIS-kanji/ASCII (DEFAULT B)\n");
fprintf(stderr,"r {de/en}crypt ROT13/47\n");
fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n");
fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n");
fprintf(stderr,"long name options\n");
- fprintf(stderr," --fj,--unix,--mac,--windows convert for the system\n");
+ fprintf(stderr," --fj,--unix,--mac,--windows convert for the system\n");
fprintf(stderr," --jis,--euc,--sjis,--utf8,--utf16,--mime,--base64 convert for the code\n");
+ fprintf(stderr," --hiragana, --katakana Hiragana/Katakana Conversion\n");
+#ifdef INPUT_OPTION
+ fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%'\n");
+#endif
+#ifdef NUMCHAR_OPTION
+ fprintf(stderr," --numchar-input Convert Unicode Character Reference\n");
+#endif
+#ifdef SHIFTJIS_CP932
+ /* Help text for the switches that control the Shift_JIS <-> CP932 row mapping. */
+ fprintf(stderr," --no-cp932 Don't convert Shift_JIS FAxx-FCxx to equivalent CP932\n");
+ fprintf(stderr," --cp932inv convert Shift_JIS EDxx-EFxx to equivalent CP932 FAxx-FCxx\n");
+#endif
+#ifdef UTF8_OUTPUT_ENABLE
+ fprintf(stderr," --ms-ucs-map Microsoft UCS Mapping Compatible\n");
+#endif
#ifdef OVERWRITE
- fprintf(stderr," --overwrite Overwrite original listed files by filtered result\n");
+ fprintf(stderr," --overwrite Overwrite original listed files by filtered result\n");
#endif
- fprintf(stderr," -g, --guess Guess the input code\n");
+ fprintf(stderr," -g, --guess Guess the input code\n");
fprintf(stderr," --help,--version\n");
version();
}
#ifdef __OS2__
"for OS/2"
#endif
- ,Version,Patchlevel);
+ ,NKF_VERSION,NKF_RELEASE_DATE);
fprintf(stderr,"\n%s\n",CopyRight);
}
#endif