#include "config.h"
static char *CopyRight =
- "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW, 2002-2003 Kono, Furukawa";
+ "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW, 2002-2004 Kono, Furukawa";
static char *Version =
"2.0";
static char *Patchlevel =
- "3/0310/Shinji Kono";
+ "4/0401/Shinji Kono";
/*
**
** USAGE: nkf [flags] [file]
**
** Flags:
-** b Output is bufferred (DEFAULT)
-** u Output is unbufferred
+** b Output is buffered (DEFAULT)
+** u Output is unbuffered
**
** t no operation
**
#include <stdio.h>
#endif
-#include <stdlib.h>
-#include <string.h>
-
#if defined(MSDOS) || defined(__OS2__)
#include <fcntl.h>
#include <io.h>
#ifdef OVERWRITE
/* added by satoru@isoternet.org */
+#include <stdlib.h>
+#include <string.h>
#include <sys/stat.h>
#ifndef MSDOS /* UNIX, OS/2 */
#include <unistd.h>
#define UTF8 12
#define UTF8_INPUT 13
-#define UTF16_INPUT 14
+#define UTF16LE_INPUT 14
#define UTF16BE_INPUT 15
#define WISH_TRUE 15
#endif
static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
#ifdef UTF8_OUTPUT_ENABLE
-static int w_oconv16_begin_f= 0; /* utf-16 header */
+static int unicode_bom_f= 0; /* Output Unicode BOM */
static int w_oconv16_LE = 0; /* utf-16 little endian */
+static int ms_ucs_map_f = FALSE; /* Microsoft UCS Mapping Compatible */
#endif
STATIC void debug PROTO((char *str));
#endif
+static int guess_f = FALSE;
+STATIC void print_guessed_code PROTO((char *filename));
+STATIC void set_input_codename PROTO((char *codename));
+static int is_inputcode_mixed = FALSE;
+static int is_inputcode_set = FALSE;
+
#ifdef EXEC_IO
static int exec_f = 0;
#endif
#define CP932_TABLE_BEGIN (0xfa)
#define CP932_TABLE_END (0xfc)
+STATIC int cp932inv_f = FALSE;
+#define CP932INV_TABLE_BEGIN (0xed)
+#define CP932INV_TABLE_END (0xee)
+
#endif /* SHIFTJIS_CP932 */
+STATIC unsigned char prefix_table[256];
+
STATIC void e_status PROTO((struct input_code *, int));
STATIC void s_status PROTO((struct input_code *, int));
#ifdef UTF8_INPUT_ENABLE
STATIC void w_status PROTO((struct input_code *, int));
STATIC void w16_status PROTO((struct input_code *, int));
-static int utf16_mode = UTF16_INPUT;
+static int utf16_mode = UTF16LE_INPUT;
#endif
struct input_code input_code_list[] = {
FILE *fin;
unsigned char *cp;
+ char *outfname;
+ char *origfname;
+
#ifdef EASYWIN /*Easy Win */
_BufferSize.y = 400;/*Set Scroll Buffer Size*/
#endif
setvbuffer(stdin, stdibuf, IOBUF_SIZE);
if (nop_f)
noconvert(stdin);
- else
+ else {
kanji_convert(stdin);
+ if (guess_f) print_guessed_code(NULL);
+ }
} else {
+ int nfiles = argc;
while (argc--) {
- char *outfname;
- char *origfname;
-
if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
perror(*--argv);
return(-1);
setvbuffer(fin, stdibuf, IOBUF_SIZE);
if (nop_f)
noconvert(fin);
- else
+ else {
+ char *filename = NULL;
kanji_convert(fin);
+ if (nfiles > 1) filename = origfname;
+ if (guess_f) print_guessed_code(filename);
+ }
fclose(fin);
#ifdef OVERWRITE
if (overwrite) {
{"hiragana","h1"},
{"katakana","h2"},
{"katakana-hiragana","h3"},
+ {"guess", "g"},
#ifdef UTF8_OUTPUT_ENABLE
{"utf8", "w"},
{"utf16", "w16"},
+ {"ms-ucs-map", ""},
#endif
#ifdef UTF8_INPUT_ENABLE
{"utf8-input", "W"},
#endif
#ifdef SHIFTJIS_CP932
{"no-cp932", ""},
+ {"cp932inv", ""},
#endif
#ifdef EXEC_IO
{"exec-in", ""},
{"exec-out", ""},
#endif
+ {"prefix=", ""},
};
static int option_mode;
for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
int j;
p = (unsigned char *)long_option[i].name;
- for (j=0;*p && *p++ == cp[j];j++);
- if (! *p && !cp[j]) break;
+ for (j=0;*p && (*p != '=') && *p == cp[j];p++, j++);
+ if (*p == cp[j]){
+ p = &cp[j];
+ break;
+ }
+ p = 0;
}
- if (*p) return;
+ if (p == 0) return;
cp = (unsigned char *)long_option[i].alias;
if (!*cp){
#ifdef OVERWRITE
#endif
#ifdef SHIFTJIS_CP932
if (strcmp(long_option[i].name, "no-cp932") == 0){
- cp932_f = TRUE;
+ cp932_f = FALSE;
+ continue;
+ }
+ if (strcmp(long_option[i].name, "cp932inv") == 0){
+ cp932inv_f = TRUE;
continue;
}
#endif
return;
}
#endif
+#ifdef UTF8_OUTPUT_ENABLE
+ if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
+ ms_ucs_map_f = TRUE;
+ continue;
+ }
+#endif
+ if (strcmp(long_option[i].name, "prefix=") == 0){
+ if (*p == '=' && ' ' < p[1] && p[1] < 128){
+ for (i = 2; ' ' < p[i] && p[i] < 128; i++){
+ prefix_table[p[i]] = p[1];
+ }
+ }
+ continue;
+ }
}
continue;
case 'b': /* buffered mode */
if ('1'== cp[0] && '6'==cp[1]) {
output_conv = w_oconv16; cp+=2;
if (cp[0]=='L') {
- w_oconv16_begin_f=2; cp++;
+ unicode_bom_f=2; cp++;
w_oconv16_LE = 1;
if (cp[0] == '0'){
- w_oconv16_begin_f=1; cp++;
+ unicode_bom_f=1; cp++;
}
} else if (cp[0] == 'B') {
- w_oconv16_begin_f=2; cp++;
+ unicode_bom_f=2; cp++;
if (cp[0] == '0'){
- w_oconv16_begin_f=1; cp++;
+ unicode_bom_f=1; cp++;
}
- }
+ }
+ } else if (cp[0] == '8') {
+ output_conv = w_oconv; cp++;
+ unicode_bom_f=2;
+ if (cp[0] == '0'){
+ unicode_bom_f=1; cp++;
+ }
} else
output_conv = w_oconv;
continue;
#ifdef UTF8_INPUT_ENABLE
case 'W': /* UTF-8 input */
if ('1'== cp[0] && '6'==cp[1]) {
- input_f = UTF16_INPUT;
+ /* Step over the "16" so that cp[0] is the optional byte-order
+  * letter, as the -w16 output branch does; without this the
+  * L/B tests below would still be looking at '1'. */
+ cp += 2;
+ input_f = UTF16LE_INPUT;
+ if (cp[0]=='L') {
+ cp++;
+ } else if (cp[0] == 'B') {
+ cp++;
+ input_f = UTF16BE_INPUT;
+ }
} else
input_f = UTF8_INPUT;
continue;
crmode_f = 0; cp++;
}
continue;
+ case 'g':
+#ifndef PERL_XS
+ guess_f = TRUE;
+#endif
+ continue;
case ' ':
/* module muliple options in a string are allowed for Perl moudle */
while(*cp && *cp!='-') cp++;
if (estab_f && iconv_for_check != iconv){
struct input_code *p = find_inputcode_byfunc(iconv);
if (p){
- debug(input_codename = p->name);
+ set_input_codename(p->name);
+ debug(input_codename);
}
iconv_for_check = iconv;
}
#endif
}
-#define SCORE_KANA (1) /* \e$B$$$o$f$kH>3Q%+%J\e(B */
+#define SCORE_L2 (1) /* JIS level-2 kanji */
+#define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width katakana */
#define SCORE_DEPEND (SCORE_KANA << 1) /* \e$B5!<o0MB8J8;z\e(B */
#ifdef SHIFTJIS_CP932
#define SCORE_CP932 (SCORE_DEPEND << 1) /* CP932 \e$B$K$h$kFI$_49$(\e(B */
};
int score_table_F0[] = {
- 0, 0, 0, 0,
- 0, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
+ SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
+ SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
};
int c1 = ptr->buf[1];
if (c2 < 0){
set_code_score(ptr, SCORE_ERROR);
- }else if ((c2 & 0xf0) == 0xa0){
- set_code_score(ptr, score_table_A0[c2 & 0x0f]);
- }else if ((c2 & 0xf0) == 0xf0){
- set_code_score(ptr, score_table_F0[c2 & 0x0f]);
}else if (c2 == SSO){
set_code_score(ptr, SCORE_KANA);
- }
#ifdef UTF8_OUTPUT_ENABLE
- else if (!e2w_conv(c2, c1)){
+ }else if (!e2w_conv(c2, c1)){
set_code_score(ptr, SCORE_NO_EXIST);
- }
#endif
+ }else if ((c2 & 0x70) == 0x20){
+ set_code_score(ptr, score_table_A0[c2 & 0x0f]);
+ }else if ((c2 & 0x70) == 0x70){
+ set_code_score(ptr, score_table_F0[c2 & 0x0f]);
+ }else if ((c2 & 0x70) >= 0x50){
+ set_code_score(ptr, SCORE_L2);
+ }
}
void status_disable(ptr)
struct input_code *ptr;
{
ptr->stat = 0;
- ptr->score &= SCORE_INIT;
ptr->index = 0;
}
if (0x80 <= c && c <= 0xbf){
status_push_ch(ptr, c);
if (ptr->index > ptr->stat){
+ int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
+ && ptr->buf[2] == 0xbf);
w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
&ptr->buf[0], &ptr->buf[1]);
- code_score(ptr);
+ if (!bom){
+ code_score(ptr);
+ }
status_clear(ptr);
}
}else{
/* output redicrection */
#ifdef CHECK_OPTION
- if (noout_f){
+ if (noout_f || guess_f){
o_putc = no_putc;
}
#endif
}
i_getc = std_getc;
+ i_ungetc = std_ungetc;
/* input redicrection */
#ifdef INPUT_OPTION
if (cap_f){
#ifdef UTF8_INPUT_ENABLE
} else if (input_f == UTF8_INPUT) {
set_iconv(-TRUE, w_iconv);
- } else if (input_f == UTF16_INPUT) {
+ } else if (input_f == UTF16LE_INPUT) {
set_iconv(-TRUE, w_iconv16);
#endif
} else {
FILE *f;
{
int c1,
- c2;
+ c2, c3;
module_connection();
c2 = 0;
/* This is kanji introduction */
input_mode = X0208;
shift_mode = FALSE;
- debug(input_codename = "ISO-2022-JP");
+ set_input_codename("ISO-2022-JP");
+ debug(input_codename);
NEXT;
} else if (c1 == '(') {
if ((c1 = (*i_getc)(f)) == EOF) {
}
} else if ( c1 == 'N' || c1 == 'n' ){
/* SS2 */
- c1 = (*i_getc)(f); /* skip SS2 */
- if ( SPACE<=c1 && c1 < 0xe0 ) {
+ c3 = (*i_getc)(f); /* skip SS2 */
+ if ( (SPACE<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
+ c1 = c3;
c2 = X0201;
SEND;
+ }else{
+ (*i_ungetc)(c3, f);
+ /* lonely ESC */
+ (*oconv)(0, ESC);
+ SEND;
}
} else {
/* lonely ESC */
int *p2, *p1;
{
#ifdef SHIFTJIS_CP932
- if (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
+ if (cp932_f && CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END){
extern unsigned short shiftjis_cp932[3][189];
c1 = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
if (c1 == 0) return 1;
}
#ifdef NUMCHAR_OPTION
if (ret){
- c1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
- c2 = 0;
+ if (p2) *p2 = 0;
+ if (p1) *p1 = CLASS_UTF16 | ww16_conv(c2, c1, c0);
ret = 0;
}
#endif
int ret;
if (c2==0376 && c1==0377){
- utf16_mode = UTF16_INPUT;
+ utf16_mode = UTF16LE_INPUT;
return 0;
} else if (c2==0377 && c1==0376){
utf16_mode = UTF16BE_INPUT;
return 0;
}
- if (utf16_mode == UTF16BE_INPUT) {
+ if (c2 != EOF && utf16_mode == UTF16BE_INPUT) {
int tmp;
tmp=c1; c1=c2; c2=tmp;
}
{
extern unsigned short euc_to_utf8_1byte[];
extern unsigned short * euc_to_utf8_2bytes[];
+ extern unsigned short * euc_to_utf8_2bytes_ms[];
unsigned short *p;
if (c2 == X0201) {
c2 &= 0x7f;
c2 = (c2&0x7f) - 0x21;
if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
- p = euc_to_utf8_2bytes[c2];
+ p = ms_ucs_map_f ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2];
else
return 0;
}
if (c2 == EOF) {
(*o_putc)(EOF);
return;
- } else if (c2 == 0) {
+ }
+
+ if (unicode_bom_f==2) {
+ (*o_putc)('\357');
+ (*o_putc)('\273');
+ (*o_putc)('\277');
+ unicode_bom_f=1;
+ }
+
+ if (c2 == 0) {
output_mode = ASCII;
(*o_putc)(c1);
} else if (c2 == ISO8859_1) {
return;
}
- if (w_oconv16_begin_f==2) {
+ if (unicode_bom_f==2) {
if (w_oconv16_LE){
(*o_putc)((unsigned char)'\377');
(*o_putc)('\376');
(*o_putc)('\376');
(*o_putc)((unsigned char)'\377');
}
- w_oconv16_begin_f=1;
+ unicode_bom_f=1;
}
if (c2 == ISO8859_1) {
output_mode = ISO8859_1;
(*o_putc)(c1 | 0x080);
} else {
- if ((c1<0x20 || 0x7e<c1) ||
- (c2<0x20 || 0x7e<c2)) {
+ if ((c1<0x21 || 0x7e<c1) ||
+ (c2<0x21 || 0x7e<c2)) {
set_iconv(FALSE, 0);
return; /* too late to rescue this char */
}
}
output_mode = SHIFT_JIS;
e2s_conv(c2, c1, &c2, &c1);
+
+#ifdef SHIFTJIS_CP932
+ if (cp932inv_f
+ && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
+ extern unsigned short cp932inv[2][189];
+ int c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
+ if (c){
+ c2 = c >> 8;
+ c1 = c & 0xff;
+ }
+ }
+#endif /* SHIFTJIS_CP932 */
+
(*o_putc)(c2);
+ if (prefix_table[(unsigned char)c1]){
+ (*o_putc)(prefix_table[(unsigned char)c1]);
+ }
(*o_putc)(c1);
}
}
(unsigned char *)"\075?ISO-2022-JP?Q?",
#if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
(unsigned char *)"\075?UTF-8?B?",
+ (unsigned char *)"\075?UTF-8?Q?",
#endif
(unsigned char *)"\075?US-ASCII?Q?",
NULL
int (*mime_priority_func[])PROTO((int c2, int c1, int c0)) = {
e_iconv, s_iconv, 0, 0, 0, 0,
#if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
- w_iconv,
+ w_iconv, w_iconv,
#endif
0,
};
int mime_encode[] = {
JAPANESE_EUC, SHIFT_JIS,ISO8859_1, ISO8859_1, X0208, X0201,
#if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
- UTF8,
+ UTF8, UTF8,
#endif
ASCII,
0
int mime_encode_method[] = {
'B', 'B','Q', 'B', 'B', 'Q',
#if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
- 'B',
+ 'B', 'Q',
#endif
'Q',
0
}
#endif
+/*
+ * Record `codename` as the current input encoding name.
+ *
+ * When guessing is enabled (guess_f) and a name has already been
+ * recorded, a non-empty `codename` that differs from the recorded one
+ * marks the input as mixed (is_inputcode_mixed).  The new name is
+ * always stored and is_inputcode_set is always raised.
+ */
+void
+set_input_codename (codename)
+    char *codename;
+{
+    /* Nested tests keep the original left-to-right evaluation order. */
+    if (guess_f && is_inputcode_set) {
+        if (codename[0] != '\0' && strcmp(input_codename, codename) != 0) {
+            is_inputcode_mixed = TRUE;
+        }
+    }
+
+    input_codename = codename;
+    is_inputcode_set = TRUE;
+}
+
+/*
+ * Print the guessed input encoding on standard output, one line.
+ *
+ * filename: when non-NULL, printed first followed by ':'.
+ *
+ * The name printed is "BINARY" if is_inputcode_mixed is set, "ASCII"
+ * if input_codename is the empty string, and input_codename otherwise.
+ */
+void
+print_guessed_code (filename)
+    char *filename;
+{
+    char *label;
+
+    if (is_inputcode_mixed) {
+        label = "BINARY";
+    } else if (input_codename[0] == '\0') {
+        label = "ASCII";
+    } else {
+        label = input_codename;
+    }
+
+    if (filename) {
+        printf("%s:", filename);
+    }
+    printf("%s\n", label);
+}
+
int
hex2bin(x)
int x;
}
}
#ifdef UTF8_OUTPUT_ENABLE
- if (w_oconv16_begin_f) {
- w_oconv16_begin_f = 2;
+ if (unicode_bom_f) {
+ unicode_bom_f = 2;
}
#endif
f_line = 0;
broken_last = 0;
z_prev2=0,z_prev1=0;
+ {
+ int i;
+ for (i = 0; i < 256; i++){
+ prefix_table[i] = 0;
+ }
+ }
input_codename = "";
+ is_inputcode_mixed = FALSE;
+ is_inputcode_set = FALSE;
}
#endif
{
fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n");
fprintf(stderr,"Flags:\n");
- fprintf(stderr,"b,u Output is bufferred (DEFAULT),Output is unbufferred\n");
+ fprintf(stderr,"b,u Output is buffered (DEFAULT),Output is unbuffered\n");
#ifdef DEFAULT_CODE_SJIS
fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC), UTF-8\n");
#endif
#ifdef OVERWRITE
fprintf(stderr," --overwrite Overwrite original listed files by filtered result\n");
#endif
+ fprintf(stderr," -g, --guess Guess the input code\n");
fprintf(stderr," --help,--version\n");
version();
}