1 /** Network Kanji Filter. (PDS Version)
2 ************************************************************************
3 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
4 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
5 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
6 ** Copyright (C) 1996,1998
8 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
9 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
10 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
11 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
13 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
14 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
15 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
16 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
17 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
18 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
20 ** Everyone is permitted to do anything on this program
21 ** including copying, modifying, improving,
22 ** as long as you don't try to pretend that you wrote it.
23 ** i.e., the above copyright notice has to appear in all copies.
24 ** Binary distribution requires original version messages.
25 ** You don't have to ask before copying, redistribution or publishing.
26 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
27 ***********************************************************************/
29 /***********************************************************************
30 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
31 * http://sourceforge.jp/projects/nkf/
32 ***********************************************************************/
33 #define NKF_IDENT "$Id: nkf.c,v 1.166 2008/01/23 09:10:25 naruse Exp $"
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2008-01-23"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2008 Kono, Furukawa, Naruse, mastodon"
44 /* state of output_mode and input_mode
122 NKF_ENCODING_TABLE_SIZE,
131 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
132 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
133 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
134 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
135 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
136 void j_oconv(nkf_char c2, nkf_char c1);
137 void s_oconv(nkf_char c2, nkf_char c1);
138 void e_oconv(nkf_char c2, nkf_char c1);
139 void w_oconv(nkf_char c2, nkf_char c1);
140 void w_oconv16(nkf_char c2, nkf_char c1);
141 void w_oconv32(nkf_char c2, nkf_char c1);
145 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
146 void (*oconv)(nkf_char c2, nkf_char c1);
147 } nkf_native_encoding;
149 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
150 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
151 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
152 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
153 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
154 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
155 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
160 const nkf_native_encoding *base_encoding;
163 nkf_encoding nkf_encoding_table[] = {
164 {ASCII, "US-ASCII", &NkfEncodingASCII},
165 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
166 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
167 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
168 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
169 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
170 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
171 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
172 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
173 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
174 {CP10001, "CP10001", &NkfEncodingShift_JIS},
175 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
176 {CP51932, "CP51932", &NkfEncodingEUC_JP},
177 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
178 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
179 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
180 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
181 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
182 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
183 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
184 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
185 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
186 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
187 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
188 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
189 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
190 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
191 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
192 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
193 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
194 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
195 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
196 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
197 {BINARY, "BINARY", &NkfEncodingASCII},
204 } encoding_name_to_id_table[] = {
207 {"ISO-2022-JP", ISO_2022_JP},
208 {"ISO2022JP-CP932", CP50220},
209 {"CP50220", CP50220},
210 {"CP50221", CP50221},
211 {"CP50222", CP50222},
212 {"ISO-2022-JP-1", ISO_2022_JP_1},
213 {"ISO-2022-JP-3", ISO_2022_JP_3},
214 {"SHIFT_JIS", SHIFT_JIS},
216 {"WINDOWS-31J", WINDOWS_31J},
217 {"CSWINDOWS31J", WINDOWS_31J},
218 {"CP932", WINDOWS_31J},
219 {"MS932", WINDOWS_31J},
220 {"CP10001", CP10001},
223 {"CP51932", CP51932},
224 {"EUC-JP-MS", EUCJP_MS},
225 {"EUCJP-MS", EUCJP_MS},
226 {"EUCJPMS", EUCJP_MS},
227 {"EUC-JP-ASCII", EUCJP_ASCII},
228 {"EUCJP-ASCII", EUCJP_ASCII},
229 {"SHIFT_JISX0213", SHIFT_JISX0213},
230 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
231 {"EUC-JISX0213", EUC_JISX0213},
232 {"EUC-JIS-2004", EUC_JIS_2004},
235 {"UTF-8-BOM", UTF_8_BOM},
236 {"UTF8-MAC", UTF8_MAC},
237 {"UTF-8-MAC", UTF8_MAC},
239 {"UTF-16BE", UTF_16BE},
240 {"UTF-16BE-BOM", UTF_16BE_BOM},
241 {"UTF-16LE", UTF_16LE},
242 {"UTF-16LE-BOM", UTF_16LE_BOM},
244 {"UTF-32BE", UTF_32BE},
245 {"UTF-32BE-BOM", UTF_32BE_BOM},
246 {"UTF-32LE", UTF_32LE},
247 {"UTF-32LE-BOM", UTF_32LE_BOM},
252 #if defined(DEFAULT_CODE_JIS)
253 #define DEFAULT_ENCIDX ISO_2022_JP
254 #elif defined(DEFAULT_CODE_SJIS)
255 #define DEFAULT_ENCIDX SHIFT_JIS
256 #elif defined(DEFAULT_CODE_EUC)
257 #define DEFAULT_ENCIDX EUC_JP
258 #elif defined(DEFAULT_CODE_UTF8)
259 #define DEFAULT_ENCIDX UTF_8
/*
 * ASCII-only character classification and conversion helpers.
 *
 * Every macro argument is parenthesized so that an expression argument
 * (for example `c & 0x7f` or `hi | lo`) is treated as a single operand.
 * Arguments are still evaluated more than once, so do not pass
 * expressions with side effects such as `*p++`.
 */
#define is_alnum(c)  \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c)  (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c)  ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c)  ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c)  (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* Value of one hexadecimal digit character; yields 0 for a non-hex character. */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
		    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
		    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* True when the byte above the low 8 bits of c2 (taken as unsigned short) equals SS3. */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/*
 * True for CR, LF, and any character strictly between SP and DEL other
 * than '?', '=', '_', '(', ')', '.' and '"' (0x22).
 */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
288 #define HOLD_SIZE 1024
289 #if defined(INT_IS_SHORT)
290 #define IOBUF_SIZE 2048
292 #define IOBUF_SIZE 16384
295 #define DEFAULT_J 'B'
296 #define DEFAULT_R 'B'
299 #define RANGE_NUM_MAX 18
304 /* MIME preprocessor */
306 #ifdef EASYWIN /*Easy Win */
307 extern POINT _BufferSize;
316 void (*status_func)(struct input_code *, nkf_char);
317 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
321 static char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
322 static nkf_encoding *input_encoding = NULL;
323 static nkf_encoding *output_encoding = NULL;
324 static void set_output_encoding(nkf_encoding *enc);
326 #if !defined(PERL_XS) && !defined(WIN32DLL)
327 static nkf_char noconvert(FILE *f);
329 static void module_connection(void);
330 static nkf_char kanji_convert(FILE *f);
331 static nkf_char h_conv(FILE *f,nkf_char c2,nkf_char c1);
332 static nkf_char push_hold_buf(nkf_char c2);
333 static void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0));
334 static nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
335 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
337 * 0: Shift_JIS, eucJP-ascii
342 #define UCS_MAP_ASCII 0
344 #define UCS_MAP_CP932 2
345 #define UCS_MAP_CP10001 3
346 static int ms_ucs_map_f = UCS_MAP_ASCII;
348 #ifdef UTF8_INPUT_ENABLE
349 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
350 static int no_cp932ext_f = FALSE;
351 /* ignore ZERO WIDTH NO-BREAK SPACE */
352 static int no_best_fit_chars_f = FALSE;
353 static int input_endian = ENDIAN_BIG;
354 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
355 static void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c);
356 static void encode_fallback_html(nkf_char c);
357 static void encode_fallback_xml(nkf_char c);
358 static void encode_fallback_java(nkf_char c);
359 static void encode_fallback_perl(nkf_char c);
360 static void encode_fallback_subchar(nkf_char c);
361 static void (*encode_fallback)(nkf_char c) = NULL;
362 static nkf_char w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
363 static nkf_char unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char *p2,nkf_char *p1);
364 static nkf_char w_iconv_common(nkf_char c1,nkf_char c0,const unsigned short *const *pp,nkf_char psize,nkf_char *p2,nkf_char *p1);
365 static void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0);
366 static nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0);
367 static nkf_char w16e_conv(nkf_char val,nkf_char *p2,nkf_char *p1);
368 static void w_status(struct input_code *, nkf_char);
370 #ifdef UTF8_OUTPUT_ENABLE
371 static int output_bom_f = FALSE;
372 static int output_endian = ENDIAN_BIG;
373 static nkf_char e2w_conv(nkf_char c2,nkf_char c1);
375 static nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1);
376 static void fold_conv(nkf_char c2,nkf_char c1);
377 static void nl_conv(nkf_char c2,nkf_char c1);
378 static void z_conv(nkf_char c2,nkf_char c1);
379 static void rot_conv(nkf_char c2,nkf_char c1);
380 static void hira_conv(nkf_char c2,nkf_char c1);
381 static void base64_conv(nkf_char c2,nkf_char c1);
382 static void iso2022jp_check_conv(nkf_char c2,nkf_char c1);
383 static void no_connection(nkf_char c2,nkf_char c1);
384 static nkf_char no_connection2(nkf_char c2,nkf_char c1,nkf_char c0);
386 static void code_score(struct input_code *ptr);
387 static void code_status(nkf_char c);
389 static void std_putc(nkf_char c);
390 static nkf_char std_getc(FILE *f);
391 static nkf_char std_ungetc(nkf_char c,FILE *f);
393 static nkf_char broken_getc(FILE *f);
394 static nkf_char broken_ungetc(nkf_char c,FILE *f);
396 static nkf_char mime_begin(FILE *f);
397 static nkf_char mime_getc(FILE *f);
398 static nkf_char mime_ungetc(nkf_char c,FILE *f);
400 static void switch_mime_getc(void);
401 static void unswitch_mime_getc(void);
402 static nkf_char mime_begin_strict(FILE *f);
403 static nkf_char mime_getc_buf(FILE *f);
404 static nkf_char mime_ungetc_buf(nkf_char c,FILE *f);
405 static nkf_char mime_integrity(FILE *f,const unsigned char *p);
407 static nkf_char base64decode(nkf_char c);
408 static void mime_prechar(nkf_char c2, nkf_char c1);
409 static void mime_putc(nkf_char c);
410 static void open_mime(nkf_char c);
411 static void close_mime(void);
412 static void eof_mime(void);
413 static void mimeout_addchar(nkf_char c);
415 static void usage(void);
416 static void show_configuration(void);
418 static void options(unsigned char *c);
419 static void reinit(void);
423 #if !defined(PERL_XS) && !defined(WIN32DLL)
424 static unsigned char stdibuf[IOBUF_SIZE];
425 static unsigned char stdobuf[IOBUF_SIZE];
427 static unsigned char hold_buf[HOLD_SIZE*2];
428 static int hold_count = 0;
430 /* MIME preprocessor fifo */
432 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
433 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
434 #define Fifo(n) mime_buf[(n)&MIME_BUF_MASK]
435 static unsigned char mime_buf[MIME_BUF_SIZE];
436 static unsigned int mime_top = 0;
437 static unsigned int mime_last = 0; /* decoded */
438 static unsigned int mime_input = 0; /* undecoded */
439 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
442 static int unbuf_f = FALSE;
443 static int estab_f = FALSE;
444 static int nop_f = FALSE;
445 static int binmode_f = TRUE; /* binary mode */
446 static int rot_f = FALSE; /* rot14/43 mode */
447 static int hira_f = FALSE; /* hira/kata henkan */
448 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
449 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
450 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
451 static int mimebuf_f = FALSE; /* MIME buffered input */
452 static int broken_f = FALSE; /* convert ESC-less broken JIS */
453 static int iso8859_f = FALSE; /* ISO8859 through */
454 static int mimeout_f = FALSE; /* base64 mode */
455 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
456 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
458 #ifdef UNICODE_NORMALIZATION
459 static int nfc_f = FALSE;
460 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of nfc_getc */
461 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
462 static nkf_char nfc_getc(FILE *f);
463 static nkf_char nfc_ungetc(nkf_char c,FILE *f);
467 static int cap_f = FALSE;
468 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
469 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
470 static nkf_char cap_getc(FILE *f);
471 static nkf_char cap_ungetc(nkf_char c,FILE *f);
473 static int url_f = FALSE;
474 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
475 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
476 static nkf_char url_getc(FILE *f);
477 static nkf_char url_ungetc(nkf_char c,FILE *f);
480 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
481 #define CLASS_MASK NKF_INT32_C(0xFF000000)
482 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
483 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
484 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
485 #define is_unicode_capsule(c) ((c & CLASS_MASK) == CLASS_UNICODE)
486 #define is_unicode_bmp(c) ((c & VALUE_MASK) <= NKF_INT32_C(0xFFFF))
488 #ifdef NUMCHAR_OPTION
489 static int numchar_f = FALSE;
490 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ngetc */
491 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
492 static nkf_char numchar_getc(FILE *f);
493 static nkf_char numchar_ungetc(nkf_char c,FILE *f);
497 static int noout_f = FALSE;
498 static void no_putc(nkf_char c);
499 static int debug_f = FALSE;
500 static void debug(const char *str);
501 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
504 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
506 static void print_guessed_code(char *filename);
508 static void set_input_codename(char *codename);
511 static int exec_f = 0;
514 #ifdef SHIFTJIS_CP932
515 /* invert IBM extended characters to others */
516 static int cp51932_f = FALSE;
518 /* invert NEC-selected IBM extended characters to IBM extended characters */
519 static int cp932inv_f = TRUE;
521 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
522 #endif /* SHIFTJIS_CP932 */
525 static int x0212_f = FALSE;
526 static nkf_char x0212_shift(nkf_char c);
527 static nkf_char x0212_unshift(nkf_char c);
529 static int x0213_f = FALSE;
531 static unsigned char prefix_table[256];
533 static void set_code_score(struct input_code *ptr, nkf_char score);
534 static void clr_code_score(struct input_code *ptr, nkf_char score);
535 static void status_disable(struct input_code *ptr);
536 static void status_push_ch(struct input_code *ptr, nkf_char c);
537 static void status_clear(struct input_code *ptr);
538 static void status_reset(struct input_code *ptr);
539 static void status_reinit(struct input_code *ptr);
540 static void status_check(struct input_code *ptr, nkf_char c);
541 static void e_status(struct input_code *, nkf_char);
542 static void s_status(struct input_code *, nkf_char);
544 struct input_code input_code_list[] = {
545 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
546 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
547 #ifdef UTF8_INPUT_ENABLE
548 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
549 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
550 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
555 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
556 static int base64_count = 0;
558 /* X0208 -> ASCII converter */
561 static int f_line = 0; /* chars in line */
562 static int f_prev = 0;
563 static int fold_preserve_f = FALSE; /* preserve new lines */
564 static int fold_f = FALSE;
565 static int fold_len = 0;
568 static unsigned char kanji_intro = DEFAULT_J;
569 static unsigned char ascii_intro = DEFAULT_R;
573 #define FOLD_MARGIN 10
574 #define DEFAULT_FOLD 60
576 static int fold_margin = FOLD_MARGIN;
578 /* process default */
579 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
580 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
582 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
583 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
584 static void (*o_nlconv)(nkf_char c2,nkf_char c1) = no_connection;
585 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
586 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
587 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
588 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
590 /* static redirections */
592 static void (*o_putc)(nkf_char c) = std_putc;
594 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
595 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
597 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of bgetc */
598 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
600 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
602 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
603 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
605 /* for strict mime */
606 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
607 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
610 static int output_mode = ASCII, /* output kanji mode */
611 input_mode = ASCII, /* input kanji mode */
612 shift_mode = FALSE; /* TRUE shift out, or X0201 */
613 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
615 /* X0201 / X0208 conversion tables */
617 /* X0201 kana conversion table */
619 static const unsigned char cv[]= {
620 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
621 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
622 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
623 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
624 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
625 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
626 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
627 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
628 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
629 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
630 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
631 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
632 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
633 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
634 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
635 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
639 /* X0201 kana conversion table for dakuten */
641 static const unsigned char dv[]= {
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
645 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
646 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
647 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
648 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
649 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
650 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
651 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
652 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
653 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
654 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
656 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
657 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
660 /* X0201 kana conversion table for handakuten */
662 static const unsigned char ev[]= {
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
664 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
665 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
666 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
667 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
668 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
669 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
670 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
671 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
672 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
673 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
674 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
675 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
676 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
677 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
678 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
682 /* X0208 kigou conversion table */
683 /* 0x8140 - 0x819e */
684 static const unsigned char fv[] = {
686 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
687 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
688 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
689 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
690 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
691 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
692 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
693 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
694 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
695 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
696 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
697 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
702 static int file_out_f = FALSE;
704 static int overwrite_f = FALSE;
705 static int preserve_time_f = FALSE;
706 static int backup_f = FALSE;
707 static char *backup_suffix = "";
708 static char *get_backup_filename(const char *suffix, const char *filename);
711 static int nlmode_f = 0; /* CR, LF, CRLF */
712 static int input_newline = 0; /* 0: unestablished, EOF: MIXED */
713 static nkf_char prev_cr = 0; /* CR or 0 */
714 #ifdef EASYWIN /*Easy Win */
715 static int end_check;
718 #define STD_GC_BUFSIZE (256)
719 nkf_char std_gc_buf[STD_GC_BUFSIZE];
722 char* nkf_strcpy(const char *str)
724 char* result = malloc(strlen(str) + 1);
733 static void nkf_str_upcase(const char *src, char *dest, size_t length)
736 for (; i < length && src[i]; i++) {
737 dest[i] = nkf_toupper(src[i]);
742 static nkf_encoding *nkf_enc_from_index(int idx)
744 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
747 return &nkf_encoding_table[idx];
750 static int nkf_enc_find_index(const char *name)
753 if (*name == 'X' && *(name+1) == '-') name += 2;
754 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
755 if (strcmp(name, encoding_name_to_id_table[i].name) == 0) {
756 return encoding_name_to_id_table[i].id;
762 static nkf_encoding *nkf_enc_find(const char *name)
765 idx = nkf_enc_find_index(name);
766 if (idx < 0) return 0;
767 return nkf_enc_from_index(idx);
770 #define nkf_enc_name(enc) (enc)->name
771 #define nkf_enc_to_index(enc) (enc)->id
772 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
773 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
774 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
775 #define nkf_enc_asciicompat(enc) (\
776 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
777 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
778 #define nkf_enc_unicode_p(enc) (\
779 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
780 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
781 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
782 #define nkf_enc_cp5022x_p(enc) (\
783 nkf_enc_to_index(enc) == CP50220 ||\
784 nkf_enc_to_index(enc) == CP50221 ||\
785 nkf_enc_to_index(enc) == CP50222)
787 #ifndef DEFAULT_ENCIDX
788 static char* nkf_locale_charmap()
790 #ifdef HAVE_LANGINFO_H
791 return nl_langinfo(CODESET);
792 #elif defined(__WIN32__)
793 return sprintf("CP%d", GetACP());
799 static nkf_encoding* nkf_locale_encoding()
801 nkf_encoding *enc = 0;
802 char *encname = nkf_locale_charmap();
804 enc = nkf_enc_find(encname);
805 if (enc < 0) enc = 0;
810 static nkf_encoding* nkf_default_encoding()
812 #ifdef DEFAULT_ENCIDX
813 return nkf_enc_from_index(DEFAULT_ENCIDX);
815 nkf_encoding *enc = nkf_locale_encoding();
816 if (enc <= 0) enc = nkf_enc_from_index(ISO_2022_JP);
822 #include "nkf32dll.c"
823 #elif defined(PERL_XS)
825 int main(int argc, char **argv)
830 char *outfname = NULL;
833 #ifdef EASYWIN /*Easy Win */
834 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
836 setlocale(LC_CTYPE, "");
838 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
839 cp = (unsigned char *)*argv;
844 if (pipe(fds) < 0 || (pid = fork()) < 0){
855 execvp(argv[1], &argv[1]);
872 int debug_f_back = debug_f;
875 int exec_f_back = exec_f;
878 int x0212_f_back = x0212_f;
880 int x0213_f_back = x0213_f;
881 int guess_f_back = guess_f;
883 guess_f = guess_f_back;
886 debug_f = debug_f_back;
889 exec_f = exec_f_back;
892 x0212_f = x0212_f_back;
894 x0213_f = x0213_f_back;
897 if (binmode_f == TRUE)
898 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
899 if (freopen("","wb",stdout) == NULL)
906 setbuf(stdout, (char *) NULL);
908 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
911 if (binmode_f == TRUE)
912 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
913 if (freopen("","rb",stdin) == NULL) return (-1);
917 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
921 kanji_convert(stdin);
922 if (guess_f) print_guessed_code(NULL);
926 int is_argument_error = FALSE;
928 input_codename = NULL;
933 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
935 is_argument_error = TRUE;
943 /* reopen file for stdout */
944 if (file_out_f == TRUE) {
947 outfname = malloc(strlen(origfname)
948 + strlen(".nkftmpXXXXXX")
954 strcpy(outfname, origfname);
958 for (i = strlen(outfname); i; --i){
959 if (outfname[i - 1] == '/'
960 || outfname[i - 1] == '\\'){
966 strcat(outfname, "ntXXXXXX");
968 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
971 strcat(outfname, ".nkftmpXXXXXX");
972 fd = mkstemp(outfname);
975 || (fd_backup = dup(fileno(stdout))) < 0
976 || dup2(fd, fileno(stdout)) < 0
987 outfname = "nkf.out";
990 if(freopen(outfname, "w", stdout) == NULL) {
994 if (binmode_f == TRUE) {
995 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
996 if (freopen("","wb",stdout) == NULL)
1003 if (binmode_f == TRUE)
1004 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
1005 if (freopen("","rb",fin) == NULL)
1010 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
1014 char *filename = NULL;
1016 if (nfiles > 1) filename = origfname;
1017 if (guess_f) print_guessed_code(filename);
1023 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1031 if (dup2(fd_backup, fileno(stdout)) < 0){
1034 if (stat(origfname, &sb)) {
1035 fprintf(stderr, "Can't stat %s\n", origfname);
1037 /* restore the original file's permissions */
1038 if (chmod(outfname, sb.st_mode)) {
1039 fprintf(stderr, "Can't set permission %s\n", outfname);
1042 /* restore the original file's timestamp */
1043 if(preserve_time_f){
1044 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
1045 tb[0] = tb[1] = sb.st_mtime;
1046 if (utime(outfname, tb)) {
1047 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1050 tb.actime = sb.st_atime;
1051 tb.modtime = sb.st_mtime;
1052 if (utime(outfname, &tb)) {
1053 fprintf(stderr, "Can't set timestamp %s\n", outfname);
1058 char *backup_filename = get_backup_filename(backup_suffix, origfname);
1060 unlink(backup_filename);
1062 if (rename(origfname, backup_filename)) {
1063 perror(backup_filename);
1064 fprintf(stderr, "Can't rename %s to %s\n",
1065 origfname, backup_filename);
1069 if (unlink(origfname)){
1074 if (rename(outfname, origfname)) {
1076 fprintf(stderr, "Can't rename %s to %s\n",
1077 outfname, origfname);
1084 if (is_argument_error)
1087 #ifdef EASYWIN /*Easy Win */
1088 if (file_out_f == FALSE)
1089 scanf("%d",&end_check);
1092 #else /* for Other OS */
1093 if (file_out_f == TRUE)
1095 #endif /*Easy Win */
1098 #endif /* WIN32DLL */
/*
 * Build the backup file name for `filename` from `suffix`.
 *
 * If `suffix` contains one or more '*' characters, the result is `suffix`
 * with every '*' replaced by `filename`.  Otherwise the result is
 * `filename` followed by `suffix`.
 *
 * Returns a newly malloc'ed NUL-terminated string that the caller must
 * free, or NULL (after reporting through perror) if allocation fails.
 */
char *get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    size_t asterisk_count = 0;
    size_t suffix_length = strlen(suffix);
    size_t filename_length = strlen(filename);
    size_t i, j;

    for (i = 0; i < suffix_length; i++) {
	if (suffix[i] == '*') asterisk_count++;
    }

    if (asterisk_count) {
	/* each one-byte '*' expands to filename_length bytes */
	backup_filename = malloc(suffix_length - asterisk_count
				 + asterisk_count * filename_length + 1);
	if (!backup_filename) {
	    perror("Can't malloc backup filename.");
	    return NULL;
	}

	for (i = 0, j = 0; i < suffix_length; i++) {
	    if (suffix[i] == '*') {
		memcpy(backup_filename + j, filename, filename_length);
		j += filename_length;
	    } else {
		backup_filename[j++] = suffix[i];
	    }
	}
	backup_filename[j] = '\0';
    } else {
	j = filename_length + suffix_length;
	/* was malloc( + 1): a 1-byte buffer that strcpy/strcat then overflowed */
	backup_filename = malloc(j + 1);
	if (!backup_filename) {
	    perror("Can't malloc backup filename.");
	    return NULL;
	}
	memcpy(backup_filename, filename, filename_length);
	memcpy(backup_filename + filename_length, suffix, suffix_length);
	backup_filename[j] = '\0';
    }
    return backup_filename;
}
1141 static const struct {
1165 {"katakana-hiragana","h3"},
1173 #ifdef UTF8_OUTPUT_ENABLE
1183 {"fb-subchar=", ""},
1185 #ifdef UTF8_INPUT_ENABLE
1186 {"utf8-input", "W"},
1187 {"utf16-input", "W16"},
1188 {"no-cp932ext", ""},
1189 {"no-best-fit-chars",""},
1191 #ifdef UNICODE_NORMALIZATION
1192 {"utf8mac-input", ""},
1204 #ifdef NUMCHAR_OPTION
1205 {"numchar-input", ""},
1211 #ifdef SHIFTJIS_CP932
1221 static void set_input_encoding(nkf_encoding *enc)
1223 switch (nkf_enc_to_index(enc)) {
1227 #ifdef SHIFTJIS_CP932
1230 #ifdef UTF8_OUTPUT_ENABLE
1231 ms_ucs_map_f = UCS_MAP_CP932;
1248 #ifdef SHIFTJIS_CP932
1251 #ifdef UTF8_OUTPUT_ENABLE
1252 ms_ucs_map_f = UCS_MAP_CP932;
1258 #ifdef SHIFTJIS_CP932
1261 #ifdef UTF8_OUTPUT_ENABLE
1262 ms_ucs_map_f = UCS_MAP_CP10001;
1266 #ifdef SHIFTJIS_CP932
1269 #ifdef UTF8_OUTPUT_ENABLE
1270 ms_ucs_map_f = UCS_MAP_CP932;
1274 #ifdef SHIFTJIS_CP932
1277 #ifdef UTF8_OUTPUT_ENABLE
1278 ms_ucs_map_f = UCS_MAP_MS;
1282 #ifdef SHIFTJIS_CP932
1285 #ifdef UTF8_OUTPUT_ENABLE
1286 ms_ucs_map_f = UCS_MAP_ASCII;
1289 case SHIFT_JISX0213:
1290 case SHIFT_JIS_2004:
1292 #ifdef SHIFTJIS_CP932
1299 #ifdef SHIFTJIS_CP932
1303 #ifdef UTF8_INPUT_ENABLE
1304 #ifdef UNICODE_NORMALIZATION
1312 input_endian = ENDIAN_BIG;
1316 input_endian = ENDIAN_LITTLE;
1321 input_endian = ENDIAN_BIG;
1325 input_endian = ENDIAN_LITTLE;
1331 static void set_output_encoding(nkf_encoding *enc)
1333 switch (nkf_enc_to_index(enc)) {
1336 #ifdef SHIFTJIS_CP932
1337 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1339 #ifdef UTF8_OUTPUT_ENABLE
1340 ms_ucs_map_f = UCS_MAP_CP932;
1344 #ifdef SHIFTJIS_CP932
1345 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1347 #ifdef UTF8_OUTPUT_ENABLE
1348 ms_ucs_map_f = UCS_MAP_CP932;
1355 #ifdef SHIFTJIS_CP932
1356 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1364 #ifdef SHIFTJIS_CP932
1365 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1371 #ifdef UTF8_OUTPUT_ENABLE
1372 ms_ucs_map_f = UCS_MAP_CP932;
1376 #ifdef UTF8_OUTPUT_ENABLE
1377 ms_ucs_map_f = UCS_MAP_CP10001;
1382 #ifdef SHIFTJIS_CP932
1383 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1385 #ifdef UTF8_OUTPUT_ENABLE
1386 ms_ucs_map_f = UCS_MAP_CP932;
1390 #ifdef SHIFTJIS_CP932
1391 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1393 #ifdef UTF8_OUTPUT_ENABLE
1394 ms_ucs_map_f = UCS_MAP_CP932;
1401 #ifdef UTF8_OUTPUT_ENABLE
1402 ms_ucs_map_f = UCS_MAP_MS;
1409 #ifdef UTF8_OUTPUT_ENABLE
1410 ms_ucs_map_f = UCS_MAP_ASCII;
1413 case SHIFT_JISX0213:
1414 case SHIFT_JIS_2004:
1416 #ifdef SHIFTJIS_CP932
1417 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1426 #ifdef SHIFTJIS_CP932
1427 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1430 #ifdef UTF8_OUTPUT_ENABLE
1432 output_bom_f = TRUE;
1436 output_bom_f = TRUE;
1439 output_endian = ENDIAN_LITTLE;
1440 output_bom_f = FALSE;
1443 output_endian = ENDIAN_LITTLE;
1444 output_bom_f = TRUE;
1447 output_bom_f = TRUE;
1450 output_endian = ENDIAN_LITTLE;
1451 output_bom_f = FALSE;
1454 output_endian = ENDIAN_LITTLE;
1455 output_bom_f = TRUE;
1461 static int option_mode = 0;
1463 void options(unsigned char *cp)
1467 unsigned char *cp_back = NULL;
1473 while(*cp && *cp++!='-');
1474 while (*cp || cp_back) {
1482 case '-': /* literal options */
1483 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
1487 for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
1488 p = (unsigned char *)long_option[i].name;
1489 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
1490 if (*p == cp[j] || cp[j] == SP){
1497 fprintf(stderr, "unknown long option: --%s\n", cp);
1500 while(*cp && *cp != SP && cp++);
1501 if (long_option[i].alias[0]){
1503 cp = (unsigned char *)long_option[i].alias;
1505 if (strcmp(long_option[i].name, "ic=") == 0){
1506 nkf_str_upcase((char *)p, codeset, 32);
1507 enc = nkf_enc_find(codeset);
1509 input_encoding = enc;
1512 if (strcmp(long_option[i].name, "oc=") == 0){
1513 nkf_str_upcase((char *)p, codeset, 32);
1514 enc = nkf_enc_find(codeset);
1515 if (enc <= 0) continue;
1516 output_encoding = enc;
1519 if (strcmp(long_option[i].name, "guess=") == 0){
1520 if (p[0] == '0' || p[0] == '1') {
1528 if (strcmp(long_option[i].name, "overwrite") == 0){
1531 preserve_time_f = TRUE;
1534 if (strcmp(long_option[i].name, "overwrite=") == 0){
1537 preserve_time_f = TRUE;
1539 backup_suffix = malloc(strlen((char *) p) + 1);
1540 strcpy(backup_suffix, (char *) p);
1543 if (strcmp(long_option[i].name, "in-place") == 0){
1546 preserve_time_f = FALSE;
1549 if (strcmp(long_option[i].name, "in-place=") == 0){
1552 preserve_time_f = FALSE;
1554 backup_suffix = malloc(strlen((char *) p) + 1);
1555 strcpy(backup_suffix, (char *) p);
1560 if (strcmp(long_option[i].name, "cap-input") == 0){
1564 if (strcmp(long_option[i].name, "url-input") == 0){
1569 #ifdef NUMCHAR_OPTION
1570 if (strcmp(long_option[i].name, "numchar-input") == 0){
1576 if (strcmp(long_option[i].name, "no-output") == 0){
1580 if (strcmp(long_option[i].name, "debug") == 0){
1585 if (strcmp(long_option[i].name, "cp932") == 0){
1586 #ifdef SHIFTJIS_CP932
1590 #ifdef UTF8_OUTPUT_ENABLE
1591 ms_ucs_map_f = UCS_MAP_CP932;
1595 if (strcmp(long_option[i].name, "no-cp932") == 0){
1596 #ifdef SHIFTJIS_CP932
1600 #ifdef UTF8_OUTPUT_ENABLE
1601 ms_ucs_map_f = UCS_MAP_ASCII;
1605 #ifdef SHIFTJIS_CP932
1606 if (strcmp(long_option[i].name, "cp932inv") == 0){
1613 if (strcmp(long_option[i].name, "x0212") == 0){
1620 if (strcmp(long_option[i].name, "exec-in") == 0){
1624 if (strcmp(long_option[i].name, "exec-out") == 0){
1629 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
1630 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
1631 no_cp932ext_f = TRUE;
1634 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
1635 no_best_fit_chars_f = TRUE;
1638 if (strcmp(long_option[i].name, "fb-skip") == 0){
1639 encode_fallback = NULL;
1642 if (strcmp(long_option[i].name, "fb-html") == 0){
1643 encode_fallback = encode_fallback_html;
1646 if (strcmp(long_option[i].name, "fb-xml") == 0){
1647 encode_fallback = encode_fallback_xml;
1650 if (strcmp(long_option[i].name, "fb-java") == 0){
1651 encode_fallback = encode_fallback_java;
1654 if (strcmp(long_option[i].name, "fb-perl") == 0){
1655 encode_fallback = encode_fallback_perl;
1658 if (strcmp(long_option[i].name, "fb-subchar") == 0){
1659 encode_fallback = encode_fallback_subchar;
1662 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
1663 encode_fallback = encode_fallback_subchar;
1664 unicode_subchar = 0;
1666 /* decimal number */
1667 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
1668 unicode_subchar *= 10;
1669 unicode_subchar += hex2bin(p[i]);
1671 }else if(p[1] == 'x' || p[1] == 'X'){
1672 /* hexadecimal number */
1673 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
1674 unicode_subchar <<= 4;
1675 unicode_subchar |= hex2bin(p[i]);
1679 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
1680 unicode_subchar *= 8;
1681 unicode_subchar += hex2bin(p[i]);
1684 w16e_conv(unicode_subchar, &i, &j);
1685 unicode_subchar = i<<8 | j;
1689 #ifdef UTF8_OUTPUT_ENABLE
1690 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
1691 ms_ucs_map_f = UCS_MAP_MS;
1695 #ifdef UNICODE_NORMALIZATION
1696 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
1701 if (strcmp(long_option[i].name, "prefix=") == 0){
1702 if (nkf_isgraph(p[0])){
1703 for (i = 1; nkf_isgraph(p[i]); i++){
1704 prefix_table[p[i]] = p[0];
1711 case 'b': /* buffered mode */
1714 case 'u': /* non bufferd mode */
1717 case 't': /* transparent mode */
1722 } else if (*cp=='2') {
1726 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
1734 case 'j': /* JIS output */
1736 output_encoding = nkf_enc_from_index(ISO_2022_JP);
1738 case 'e': /* AT&T EUC output */
1739 output_encoding = nkf_enc_from_index(EUC_JP);
1741 case 's': /* SJIS output */
1742 output_encoding = nkf_enc_from_index(WINDOWS_31J);
1744 case 'l': /* ISO8859 Latin-1 support, no conversion */
1745 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
1746 input_encoding = nkf_enc_from_index(ISO_8859_1);
1748 case 'i': /* Kanji IN ESC-$-@/B */
1749 if (*cp=='@'||*cp=='B')
1750 kanji_intro = *cp++;
1752 case 'o': /* ASCII IN ESC-(-J/B */
1753 if (*cp=='J'||*cp=='B'||*cp=='H')
1754 ascii_intro = *cp++;
1758 bit:1 katakana->hiragana
1759 bit:2 hiragana->katakana
1761 if ('9'>= *cp && *cp>='0')
1762 hira_f |= (*cp++ -'0');
1769 #if defined(MSDOS) || defined(__OS2__)
1776 show_configuration();
1784 #ifdef UTF8_OUTPUT_ENABLE
1785 case 'w': /* UTF-8 output */
1790 output_encoding = nkf_enc_from_index(UTF_8N);
1792 output_bom_f = TRUE;
1793 output_encoding = nkf_enc_from_index(UTF_8_BOM);
1797 if ('1'== cp[0] && '6'==cp[1]) {
1800 } else if ('3'== cp[0] && '2'==cp[1]) {
1804 output_encoding = nkf_enc_from_index(UTF_8);
1809 output_endian = ENDIAN_LITTLE;
1810 } else if (cp[0] == 'B') {
1813 output_encoding = nkf_enc_from_index(enc_idx);
1818 enc_idx = enc_idx == UTF_16
1819 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
1820 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
1822 output_bom_f = TRUE;
1823 enc_idx = enc_idx == UTF_16
1824 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
1825 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
1827 output_encoding = nkf_enc_from_index(enc_idx);
1831 #ifdef UTF8_INPUT_ENABLE
1832 case 'W': /* UTF input */
1835 input_encoding = nkf_enc_from_index(UTF_8);
1838 if ('1'== cp[0] && '6'==cp[1]) {
1840 input_endian = ENDIAN_BIG;
1842 } else if ('3'== cp[0] && '2'==cp[1]) {
1844 input_endian = ENDIAN_BIG;
1847 input_encoding = nkf_enc_from_index(UTF_8);
1852 input_endian = ENDIAN_LITTLE;
1853 } else if (cp[0] == 'B') {
1855 input_endian = ENDIAN_BIG;
1857 enc_idx = enc_idx == UTF_16 /* -W selects the INPUT encoding: pick LE/BE from input_endian (set just above), not output_endian */
1858 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
1859 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
1860 input_encoding = nkf_enc_from_index(enc_idx);
1864 /* Input code assumption */
1865 case 'J': /* ISO-2022-JP input */
1866 input_encoding = nkf_enc_from_index(ISO_2022_JP);
1868 case 'E': /* EUC-JP input */
1869 input_encoding = nkf_enc_from_index(EUC_JP);
1871 case 'S': /* Windows-31J input */
1872 input_encoding = nkf_enc_from_index(WINDOWS_31J);
1874 case 'Z': /* Convert X0208 alphabet to asii */
1876 bit:0 Convert JIS X 0208 Alphabet to ASCII
1877 bit:1 Convert Kankaku to one space
1878 bit:2 Convert Kankaku to two spaces
1879 bit:3 Convert HTML Entity
1880 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
1882 while ('0'<= *cp && *cp <='9') {
1883 alpha_f |= 1 << (*cp++ - '0');
1885 if (!alpha_f) alpha_f = 1;
1887 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
1888 x0201_f = FALSE; /* No X0201->X0208 conversion */
1890 ESC-(-I in JIS, EUC, MS Kanji
1891 SI/SO in JIS, EUC, MS Kanji
1892 SSO in EUC, JIS, not in MS Kanji
1893 MS Kanji (0xa0-0xdf)
1895 ESC-(-I in JIS (0x20-0x5f)
1896 SSO in EUC (0xa0-0xdf)
1897 0xa0-0xd in MS Kanji (0xa0-0xdf)
1900 case 'X': /* Convert X0201 kana to X0208 */
1903 case 'F': /* prserve new lines */
1904 fold_preserve_f = TRUE;
1905 case 'f': /* folding -f60 or -f */
1908 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1910 fold_len += *cp++ - '0';
1912 if (!(0<fold_len && fold_len<BUFSIZ))
1913 fold_len = DEFAULT_FOLD;
1917 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
1919 fold_margin += *cp++ - '0';
1923 case 'm': /* MIME support */
1924 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
1925 if (*cp=='B'||*cp=='Q') {
1926 mime_decode_mode = *cp++;
1927 mimebuf_f = FIXED_MIME;
1928 } else if (*cp=='N') {
1929 mime_f = TRUE; cp++;
1930 } else if (*cp=='S') {
1931 mime_f = STRICT_MIME; cp++;
1932 } else if (*cp=='0') {
1933 mime_decode_f = FALSE;
1934 mime_f = FALSE; cp++;
1936 mime_f = STRICT_MIME;
1939 case 'M': /* MIME output */
1942 mimeout_f = FIXED_MIME; cp++;
1943 } else if (*cp=='Q') {
1945 mimeout_f = FIXED_MIME; cp++;
1950 case 'B': /* Broken JIS support */
1952 bit:1 allow any x on ESC-(-x or ESC-$-x
1953 bit:2 reset to ascii on NL
1955 if ('9'>= *cp && *cp>='0')
1956 broken_f |= 1<<(*cp++ -'0');
1961 case 'O':/* for Output file */
1965 case 'c':/* add cr code */
1968 case 'd':/* delete cr code */
1971 case 'I': /* ISO-2022-JP output */
1974 case 'L': /* line mode */
1975 if (*cp=='u') { /* unix */
1976 nlmode_f = LF; cp++;
1977 } else if (*cp=='m') { /* mac */
1978 nlmode_f = CR; cp++;
1979 } else if (*cp=='w') { /* windows */
1980 nlmode_f = CRLF; cp++;
1981 } else if (*cp=='0') { /* no conversion */
1987 if ('2' <= *cp && *cp <= '9') {
1990 } else if (*cp == '0' || *cp == '1') {
1999 /* multiple options in a string are allowed for the Perl module */
2000 while(*cp && *cp++!='-');
2003 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
2004 /* bogus option but ignored */
2010 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2013 struct input_code *p = input_code_list;
2015 if (iconv_func == p->iconv_func){
2024 void set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
2026 #ifdef INPUT_CODE_FIX
2027 if (f || !input_encoding)
2034 #ifdef INPUT_CODE_FIX
2035 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
2041 if (estab_f && iconv_for_check != iconv){
2042 struct input_code *p = find_inputcode_byfunc(iconv);
2044 set_input_codename(p->name);
2047 iconv_for_check = iconv;
/* Score bits for candidate input encodings: set_code_score() ORs them into
 * input_code.score, and the candidate with the lower score is preferred
 * when the input encoding is guessed. */
2052 #define SCORE_L2 (1) /* JIS level-2 kanji */
2053 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
2054 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
2055 #define SCORE_CP932 (SCORE_DEPEND << 1) /* remapped by CP932 (IBM extended characters) */
2056 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2057 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* nonexistent characters */
2058 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
2059 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
2061 #define SCORE_INIT (SCORE_iMIME) /* value status_reset() assigns to a candidate's score */
2063 static const char score_table_A0[] = {
2066 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2067 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
2070 static const char score_table_F0[] = {
2071 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2072 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2073 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2074 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
2077 void set_code_score(struct input_code *ptr, nkf_char score)
2080 ptr->score |= score;
2084 void clr_code_score(struct input_code *ptr, nkf_char score)
2087 ptr->score &= ~score;
2091 void code_score(struct input_code *ptr)
2093 nkf_char c2 = ptr->buf[0];
2094 #ifdef UTF8_OUTPUT_ENABLE
2095 nkf_char c1 = ptr->buf[1];
2098 set_code_score(ptr, SCORE_ERROR);
2099 }else if (c2 == SSO){
2100 set_code_score(ptr, SCORE_KANA);
2101 }else if (c2 == 0x8f){
2102 set_code_score(ptr, SCORE_X0212);
2103 #ifdef UTF8_OUTPUT_ENABLE
2104 }else if (!e2w_conv(c2, c1)){
2105 set_code_score(ptr, SCORE_NO_EXIST);
2107 }else if ((c2 & 0x70) == 0x20){
2108 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
2109 }else if ((c2 & 0x70) == 0x70){
2110 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
2111 }else if ((c2 & 0x70) >= 0x50){
2112 set_code_score(ptr, SCORE_L2);
2116 void status_disable(struct input_code *ptr)
2121 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
2124 void status_push_ch(struct input_code *ptr, nkf_char c)
2126 ptr->buf[ptr->index++] = c;
2129 void status_clear(struct input_code *ptr)
2135 void status_reset(struct input_code *ptr)
2138 ptr->score = SCORE_INIT;
2141 void status_reinit(struct input_code *ptr)
2144 ptr->_file_stat = 0;
2147 void status_check(struct input_code *ptr, nkf_char c)
2149 if (c <= DEL && estab_f){
2154 void s_status(struct input_code *ptr, nkf_char c)
2158 status_check(ptr, c);
2163 #ifdef NUMCHAR_OPTION
2164 }else if (is_unicode_capsule(c)){
2167 }else if (0xa1 <= c && c <= 0xdf){
2168 status_push_ch(ptr, SSO);
2169 status_push_ch(ptr, c);
2172 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2174 status_push_ch(ptr, c);
2175 }else if (0xed <= c && c <= 0xee){
2177 status_push_ch(ptr, c);
2178 #ifdef SHIFTJIS_CP932
2179 }else if (is_ibmext_in_sjis(c)){
2181 status_push_ch(ptr, c);
2182 #endif /* SHIFTJIS_CP932 */
2184 }else if (0xf0 <= c && c <= 0xfc){
2186 status_push_ch(ptr, c);
2187 #endif /* X0212_ENABLE */
2189 status_disable(ptr);
2193 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2194 status_push_ch(ptr, c);
2195 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2199 status_disable(ptr);
2203 #ifdef SHIFTJIS_CP932
2204 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2205 status_push_ch(ptr, c);
2206 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2207 set_code_score(ptr, SCORE_CP932);
2212 #endif /* SHIFTJIS_CP932 */
2213 status_disable(ptr);
2216 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2217 status_push_ch(ptr, c);
2218 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2219 set_code_score(ptr, SCORE_CP932);
2222 status_disable(ptr);
2228 void e_status(struct input_code *ptr, nkf_char c)
2232 status_check(ptr, c);
2237 #ifdef NUMCHAR_OPTION
2238 }else if (is_unicode_capsule(c)){
2241 }else if (SSO == c || (0xa1 <= c && c <= 0xfe)){
2243 status_push_ch(ptr, c);
2245 }else if (0x8f == c){
2247 status_push_ch(ptr, c);
2248 #endif /* X0212_ENABLE */
2250 status_disable(ptr);
2254 if (0xa1 <= c && c <= 0xfe){
2255 status_push_ch(ptr, c);
2259 status_disable(ptr);
2264 if (0xa1 <= c && c <= 0xfe){
2266 status_push_ch(ptr, c);
2268 status_disable(ptr);
2270 #endif /* X0212_ENABLE */
2274 #ifdef UTF8_INPUT_ENABLE
2275 void w_status(struct input_code *ptr, nkf_char c)
2279 status_check(ptr, c);
2284 #ifdef NUMCHAR_OPTION
2285 }else if (is_unicode_capsule(c)){
2288 }else if (0xc0 <= c && c <= 0xdf){
2290 status_push_ch(ptr, c);
2291 }else if (0xe0 <= c && c <= 0xef){
2293 status_push_ch(ptr, c);
2294 }else if (0xf0 <= c && c <= 0xf4){
2296 status_push_ch(ptr, c);
2298 status_disable(ptr);
2303 if (0x80 <= c && c <= 0xbf){
2304 status_push_ch(ptr, c);
2305 if (ptr->index > ptr->stat){
2306 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2307 && ptr->buf[2] == 0xbf);
2308 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2309 &ptr->buf[0], &ptr->buf[1]);
2316 status_disable(ptr);
2320 if (0x80 <= c && c <= 0xbf){
2321 if (ptr->index < ptr->stat){
2322 status_push_ch(ptr, c);
2327 status_disable(ptr);
2334 void code_status(nkf_char c)
2336 int action_flag = 1;
2337 struct input_code *result = 0;
2338 struct input_code *p = input_code_list;
2340 if (!p->status_func) {
2344 if (!p->status_func)
2346 (p->status_func)(p, c);
2349 }else if(p->stat == 0){
2360 if (result && !estab_f){
2361 set_iconv(TRUE, result->iconv_func);
2362 }else if (c <= DEL){
2363 struct input_code *ptr = input_code_list;
2373 nkf_char std_getc(FILE *f)
2376 return std_gc_buf[--std_gc_ndx];
2382 nkf_char std_ungetc(nkf_char c, FILE *f)
2384 if (std_gc_ndx == STD_GC_BUFSIZE){
2387 std_gc_buf[std_gc_ndx++] = c;
2392 void std_putc(nkf_char c)
2399 #if !defined(PERL_XS) && !defined(WIN32DLL)
2400 nkf_char noconvert(FILE *f)
2405 module_connection();
2406 while ((c = (*i_getc)(f)) != EOF)
2413 void module_connection(void)
2415 if (input_encoding) set_input_encoding(input_encoding);
2416 if (!output_encoding) {
2417 output_encoding = nkf_default_encoding();
2419 set_output_encoding(output_encoding);
2420 oconv = nkf_enc_to_oconv(output_encoding);
2423 /* replace continuation module, from output side */
2425 /* output redirection */
2427 if (noout_f || guess_f){
2434 if (mimeout_f == TRUE) {
2435 o_base64conv = oconv; oconv = base64_conv;
2437 /* base64_count = 0; */
2440 if (nlmode_f || guess_f) {
2441 o_nlconv = oconv; oconv = nl_conv;
2444 o_rot_conv = oconv; oconv = rot_conv;
2447 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
2450 o_hira_conv = oconv; oconv = hira_conv;
2453 o_fconv = oconv; oconv = fold_conv;
2456 if (alpha_f || x0201_f) {
2457 o_zconv = oconv; oconv = z_conv;
2461 i_ungetc = std_ungetc;
2462 /* input redirection */
2465 i_cgetc = i_getc; i_getc = cap_getc;
2466 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
2469 i_ugetc = i_getc; i_getc = url_getc;
2470 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
2473 #ifdef NUMCHAR_OPTION
2475 i_ngetc = i_getc; i_getc = numchar_getc;
2476 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
2479 #ifdef UNICODE_NORMALIZATION
2481 i_nfc_getc = i_getc; i_getc = nfc_getc;
2482 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
2485 if (mime_f && mimebuf_f==FIXED_MIME) {
2486 i_mgetc = i_getc; i_getc = mime_getc;
2487 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
2490 i_bgetc = i_getc; i_getc = broken_getc;
2491 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
2493 if (input_encoding) {
2494 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
2496 set_iconv(FALSE, e_iconv);
2500 struct input_code *p = input_code_list;
2508 * Check and Ignore BOM
2510 void check_bom(FILE *f)
2513 switch(c2 = (*i_getc)(f)){
2515 if((c2 = (*i_getc)(f)) == 0x00){
2516 if((c2 = (*i_getc)(f)) == 0xFE){
2517 if((c2 = (*i_getc)(f)) == 0xFF){
2518 if(!input_encoding){
2519 set_iconv(TRUE, w_iconv32);
2521 if (iconv == w_iconv32) {
2522 input_endian = ENDIAN_BIG;
2525 (*i_ungetc)(0xFF,f);
2526 }else (*i_ungetc)(c2,f);
2527 (*i_ungetc)(0xFE,f);
2528 }else if(c2 == 0xFF){
2529 if((c2 = (*i_getc)(f)) == 0xFE){
2530 if(!input_encoding){
2531 set_iconv(TRUE, w_iconv32);
2533 if (iconv == w_iconv32) {
2534 input_endian = ENDIAN_2143;
2537 (*i_ungetc)(0xFF,f);
2538 }else (*i_ungetc)(c2,f);
2539 (*i_ungetc)(0xFF,f);
2540 }else (*i_ungetc)(c2,f);
2541 (*i_ungetc)(0x00,f);
2542 }else (*i_ungetc)(c2,f);
2543 (*i_ungetc)(0x00,f);
2546 if((c2 = (*i_getc)(f)) == 0xBB){
2547 if((c2 = (*i_getc)(f)) == 0xBF){
2548 if(!input_encoding){
2549 set_iconv(TRUE, w_iconv);
2551 if (iconv == w_iconv) {
2554 (*i_ungetc)(0xBF,f);
2555 }else (*i_ungetc)(c2,f);
2556 (*i_ungetc)(0xBB,f);
2557 }else (*i_ungetc)(c2,f);
2558 (*i_ungetc)(0xEF,f);
2561 if((c2 = (*i_getc)(f)) == 0xFF){
2562 if((c2 = (*i_getc)(f)) == 0x00){
2563 if((c2 = (*i_getc)(f)) == 0x00){
2564 if(!input_encoding){
2565 set_iconv(TRUE, w_iconv32);
2567 if (iconv == w_iconv32) {
2568 input_endian = ENDIAN_3412;
2571 (*i_ungetc)(0x00,f);
2572 }else (*i_ungetc)(c2,f);
2573 (*i_ungetc)(0x00,f);
2574 }else (*i_ungetc)(c2,f);
2575 if(!input_encoding){
2576 set_iconv(TRUE, w_iconv16);
2578 if (iconv == w_iconv16) {
2579 input_endian = ENDIAN_BIG;
2582 (*i_ungetc)(0xFF,f);
2583 }else (*i_ungetc)(c2,f);
2584 (*i_ungetc)(0xFE,f);
2587 if((c2 = (*i_getc)(f)) == 0xFE){
2588 if((c2 = (*i_getc)(f)) == 0x00){
2589 if((c2 = (*i_getc)(f)) == 0x00){
2590 if(!input_encoding){
2591 set_iconv(TRUE, w_iconv32);
2593 if (iconv == w_iconv32) {
2594 input_endian = ENDIAN_LITTLE;
2597 (*i_ungetc)(0x00,f);
2598 }else (*i_ungetc)(c2,f);
2599 (*i_ungetc)(0x00,f);
2600 }else (*i_ungetc)(c2,f);
2601 if(!input_encoding){
2602 set_iconv(TRUE, w_iconv16);
2604 if (iconv == w_iconv16) {
2605 input_endian = ENDIAN_LITTLE;
2608 (*i_ungetc)(0xFE,f);
2609 }else (*i_ungetc)(c2,f);
2610 (*i_ungetc)(0xFF,f);
2619 Conversion main loop. Code detection only.
2622 nkf_char kanji_convert(FILE *f)
2624 nkf_char c3, c2=0, c1, c0=0;
2625 int is_8bit = FALSE;
2627 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
2632 output_mode = ASCII;
2635 #define NEXT continue /* no output, get next */
2636 #define SEND ; /* output c1 and c2, get next */
2637 #define LAST break /* end of loop, go closing */
2639 module_connection();
2642 while ((c1 = (*i_getc)(f)) != EOF) {
2643 #ifdef INPUT_CODE_FIX
2644 if (!input_encoding)
2649 if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
2650 /* in case of 8th bit is on */
2651 if (!estab_f&&!mime_decode_mode) {
2652 /* in case of not established yet */
2653 /* It is still ambiguous */
2654 if (h_conv(f, c2, c1)==EOF)
2660 /* in case of already established */
2662 /* ignore bogus code and not CP5022x UCD */
2670 /* second byte, 7 bit code */
2671 /* it might be kanji shifted */
2672 if ((c1 == DEL) || (c1 <= SP)) {
2673 /* ignore bogus first code */
2680 #ifdef UTF8_INPUT_ENABLE
2681 if (iconv == w_iconv16) {
2682 if (input_endian == ENDIAN_BIG) {
2684 if ((c1 = (*i_getc)(f)) != EOF) {
2685 if (0xD8 <= c2 && c2 <= 0xDB) {
2686 if ((c0 = (*i_getc)(f)) != EOF) {
2688 if ((c3 = (*i_getc)(f)) != EOF) {
2695 if ((c2 = (*i_getc)(f)) != EOF) {
2696 if (0xD8 <= c2 && c2 <= 0xDB) {
2697 if ((c3 = (*i_getc)(f)) != EOF) {
2698 if ((c0 = (*i_getc)(f)) != EOF) {
2707 } else if(iconv == w_iconv32){
2709 if((c2 = (*i_getc)(f)) != EOF &&
2710 (c1 = (*i_getc)(f)) != EOF &&
2711 (c0 = (*i_getc)(f)) != EOF){
2712 switch(input_endian){
2714 c1 = (c2&0xFF)<<16 | (c1&0xFF)<<8 | (c0&0xFF);
2717 c1 = (c3&0xFF) | (c2&0xFF)<<8 | (c1&0xFF)<<16;
2720 c1 = (c3&0xFF)<<16 | (c1&0xFF) | (c0&0xFF)<<8;
2723 c1 = (c3&0xFF)<<8 | (c2&0xFF) | (c0&0xFF)<<16;
2733 #ifdef NUMCHAR_OPTION
2734 if (is_unicode_capsule(c1)){
2738 if (c1 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
2740 if (!estab_f && !iso8859_f) {
2741 /* not established yet */
2744 } else { /* estab_f==TRUE */
2749 } else if (SSP<=c1 && c1<0xe0 && iconv == s_iconv) {
2750 /* SJIS X0201 Case... */
2751 if (iso2022jp_f && !x0201_f) {
2752 (*oconv)(GETA1, GETA2);
2759 } else if (c1==SSO && iconv != s_iconv) {
2760 /* EUC X0201 Case */
2761 c1 = (*i_getc)(f); /* skip SSO */
2763 if (SSP<=c1 && c1<0xe0) {
2764 if (iso2022jp_f && !x0201_f) {
2765 (*oconv)(GETA1, GETA2);
2772 } else { /* bogus code, skip SSO and one byte */
2775 } else if (ms_ucs_map_f == UCS_MAP_CP10001 &&
2776 (c1 == 0xFD || c1 == 0xFE)) {
2782 /* already established */
2787 } else if ((c1 > SP) && (c1 != DEL)) {
2788 /* in case of Roman characters */
2790 /* output 1 shifted byte */
2794 } else if (SP <= c1 && c1 < (0xe0&0x7f)){
2795 /* output 1 shifted byte */
2796 if (iso2022jp_f && !x0201_f) {
2797 (*oconv)(GETA1, GETA2);
2804 /* look like bogus code */
2807 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
2808 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
2809 /* in case of Kanji shifted */
2812 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
2813 /* Check MIME code */
2814 if ((c1 = (*i_getc)(f)) == EOF) {
2817 } else if (c1 == '?') {
2818 /* =? is mime conversion start sequence */
2819 if(mime_f == STRICT_MIME) {
2820 /* check in real detail */
2821 if (mime_begin_strict(f) == EOF)
2825 } else if (mime_begin(f) == EOF)
2835 /* normal ASCII code */
2838 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
2841 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
2844 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
2845 if ((c1 = (*i_getc)(f)) == EOF) {
2846 /* (*oconv)(0, ESC); don't send bogus code */
2848 } else if (c1 == '$') {
2849 if ((c1 = (*i_getc)(f)) == EOF) {
2851 (*oconv)(0, ESC); don't send bogus code
2852 (*oconv)(0, '$'); */
2854 } else if (c1 == '@'|| c1 == 'B') {
2855 /* This is kanji introduction */
2856 input_mode = JIS_X_0208;
2858 set_input_codename("ISO-2022-JP");
2860 debug("ISO-2022-JP");
2863 } else if (c1 == '(') {
2864 if ((c1 = (*i_getc)(f)) == EOF) {
2865 /* don't send bogus code
2871 } else if (c1 == '@'|| c1 == 'B') {
2872 /* This is kanji introduction */
2873 input_mode = JIS_X_0208;
2877 } else if (c1 == 'D'){
2878 input_mode = JIS_X_0212;
2881 #endif /* X0212_ENABLE */
2882 } else if (c1 == 0x4F){
2883 input_mode = JIS_X_0213_1;
2886 } else if (c1 == 0x50){
2887 input_mode = JIS_X_0213_2;
2891 /* could be some special code */
2898 } else if (broken_f&0x2) {
2899 /* accept any ESC-(-x as broken code ... */
2900 input_mode = JIS_X_0208;
2909 } else if (c1 == '(') {
2910 if ((c1 = (*i_getc)(f)) == EOF) {
2911 /* don't send bogus code
2913 (*oconv)(0, '('); */
2917 /* This is X0201 kana introduction */
2918 input_mode = JIS_X_0201; shift_mode = JIS_X_0201;
2920 } else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
2921 /* This is X0208 kanji introduction */
2922 input_mode = ASCII; shift_mode = FALSE;
2924 } else if (broken_f&0x2) {
2925 input_mode = ASCII; shift_mode = FALSE;
2930 /* maintain various input_mode here */
2934 } else if ( c1 == 'N' || c1 == 'n'){
2936 c3 = (*i_getc)(f); /* skip SS2 */
2937 if ( (SP<=c3 && c3 < 0x60) || (0xa0<=c3 && c3 < 0xe0)){
2952 } else if (c1 == ESC && iconv == s_iconv) {
2953 /* ESC in Shift_JIS */
2954 if ((c1 = (*i_getc)(f)) == EOF) {
2955 /* (*oconv)(0, ESC); don't send bogus code */
2957 } else if (c1 == '$') {
2959 if ((c1 = (*i_getc)(f)) == EOF) {
2961 (*oconv)(0, ESC); don't send bogus code
2962 (*oconv)(0, '$'); */
2965 if (('E' <= c1 && c1 <= 'G') ||
2966 ('O' <= c1 && c1 <= 'Q')) {
2974 static const char jphone_emoji_first_table[7] = {2, 0, 3, 4, 5, 0, 1};
2975 c0 = (jphone_emoji_first_table[c1 % 7] << 8) - SP + 0xE000 + CLASS_UNICODE;
2976 while ((c1 = (*i_getc)(f)) != EOF) {
2977 if (SP <= c1 && c1 <= 'z') {
2978 (*oconv)(0, c1 + c0);
2979 } else break; /* c1 == SO */
2983 if (c1 == EOF) LAST;
2990 } else if (c1 == LF || c1 == CR) {
2992 input_mode = ASCII; set_iconv(FALSE, 0);
2994 } else if (mime_decode_f && !mime_decode_mode){
2996 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
3004 } else { /* if (c1 == CR)*/
3005 if ((c1=(*i_getc)(f))!=EOF) {
3009 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
3023 } else if (c1 == DEL && input_mode == JIS_X_0208) {
3033 switch ((*iconv)(c2, c1, c0)) { /* can be EUC / SJIS / UTF-8 / UTF-16 */
3036 if ((c0 = (*i_getc)(f)) != EOF) {
3039 if ((c3 = (*i_getc)(f)) != EOF) {
3041 (*iconv)(c2, c1, c0|c3);
3046 /* 3 bytes EUC or UTF-8 */
3047 if ((c0 = (*i_getc)(f)) != EOF) {
3049 (*iconv)(c2, c1, c0);
3057 0x7F <= c2 && c2 <= 0x92 &&
3058 0x21 <= c1 && c1 <= 0x7E) {
3060 if(c1 == 0x7F) return 0;
3061 c1 = (c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000 + CLASS_UNICODE;
3064 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
3068 (*oconv)(PREFIX_EUCG3 | c2, c1);
3070 #endif /* X0212_ENABLE */
3072 (*oconv)(PREFIX_EUCG3 | c2, c1);
3075 (*oconv)(input_mode, c1); /* other special case */
3081 /* goto next_word */
3085 (*iconv)(EOF, 0, 0);
3086 if (!input_codename)
3089 struct input_code *p = input_code_list;
3090 struct input_code *result = p;
3092 if (p->score < result->score) result = p;
3095 set_input_codename(result->name);
3097 debug(result->name);
3105 h_conv(FILE *f, nkf_char c2, nkf_char c1)
3107 nkf_char ret, c3, c0;
3111 /** it must NOT be in the kanji shift sequence */
3112 /** it must NOT be written in JIS7 */
3113 /** and it must be after 2 byte 8bit code */
3119 while ((c1 = (*i_getc)(f)) != EOF) {
3125 if (push_hold_buf(c1) == EOF || estab_f){
3131 struct input_code *p = input_code_list;
3132 struct input_code *result = p;
3137 if (p->status_func && p->score < result->score){
3142 set_iconv(TRUE, result->iconv_func);
3147 ** 1) EOF is detected, or
3148 ** 2) Code is established, or
3149 ** 3) Buffer is FULL (but last word is pushed)
3151 ** in 1) and 3) cases, we continue to use
3152 ** Kanji codes by oconv and leave estab_f unchanged.
3157 while (hold_index < hold_count){
3158 c2 = hold_buf[hold_index++];
3160 #ifdef NUMCHAR_OPTION
3161 || is_unicode_capsule(c2)
3166 }else if (iconv == s_iconv && 0xa1 <= c2 && c2 <= 0xdf){
3167 (*iconv)(JIS_X_0201, c2, 0);
3170 if (hold_index < hold_count){
3171 c1 = hold_buf[hold_index++];
3181 switch ((*iconv)(c2, c1, 0)) { /* can be EUC/SJIS/UTF-8 */
3184 if (hold_index < hold_count){
3185 c0 = hold_buf[hold_index++];
3186 } else if ((c0 = (*i_getc)(f)) == EOF) {
3192 if (hold_index < hold_count){
3193 c3 = hold_buf[hold_index++];
3194 } else if ((c3 = (*i_getc)(f)) == EOF) {
3199 (*iconv)(c2, c1, c0|c3);
3204 /* 3 bytes EUC or UTF-8 */
3205 if (hold_index < hold_count){
3206 c0 = hold_buf[hold_index++];
3207 } else if ((c0 = (*i_getc)(f)) == EOF) {
3213 (*iconv)(c2, c1, c0);
3216 if (c0 == EOF) break;
3221 nkf_char push_hold_buf(nkf_char c2)
3223 if (hold_count >= HOLD_SIZE*2)
3225 hold_buf[hold_count++] = (unsigned char)c2;
3226 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3229 nkf_char s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
3231 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
3234 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
3235 #ifdef SHIFTJIS_CP932
3236 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
3237 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
3244 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
3245 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
3251 #endif /* SHIFTJIS_CP932 */
3253 if (!x0213_f && is_ibmext_in_sjis(c2)){
3254 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
3257 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
3270 if(x0213_f && c2 >= 0xF0){
3271 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
3272 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
3273 }else{ /* 78<=k<=94 */
3274 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
3275 if (0x9E < c1) c2++;
3278 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
3279 #define SJ6394 0x0161 /* 63 - 94 ku offset */
3280 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
3281 if (0x9E < c1) c2++;
3284 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
3291 c2 = x0212_unshift(c2);
/*
 * s_iconv - input converter for Shift_JIS (c2 lead byte, c1 trail byte).
 *
 * Branches visible here:
 *  - c2 == JIS_X_0201, and c2 == EOF / 0 / below SP, are handled separately.
 *  - when x0213_f is off, lead bytes 0xF0..0xF9 with trail bytes 0x40..0xFC
 *    are mapped to CLASS_UNICODE code points starting at U+E000, 188 cells
 *    per lead byte; a trail byte of 0x7F returns 0 without converting.
 *  - everything else goes through s2e_conv(); a non-zero result from it is
 *    returned unchanged.
 */
3298 nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3300 if (c2 == JIS_X_0201) {
3302 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3304 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
3306 if(c1 == 0x7F) return 0;
/* (0x7E < c1) closes the gap left by the skipped trail byte 0x7F. */
3307 c1 = (c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000 + CLASS_UNICODE;
3310 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
3311 if (ret) return ret;
/*
 * e_iconv - input converter for EUC-JP (c2, c1, and c0 for three-byte
 * sequences).
 *
 * Branches visible here:
 *  - c2 == JIS_X_0201.
 *  - c2 == 0x8f (three-byte sequence): with cp51932_f and x0213_f both off,
 *    c1 in 0xF5..0xFE and c0 in 0xA1..0xFE map to CLASS_UNICODE code points
 *    starting at U+E3AC (94 cells per row); otherwise c2 becomes
 *    (c2 << 8) | (c1 & 0x7f) and, under SHIFTJIS_CP932, the pair is
 *    round-tripped through e2s_conv()/s2e_conv().
 *  - c2 == SSO, and c2 == EOF / 0 / below SP.
 *  - two-byte codes: with cp51932_f off and ms_ucs_map_f set, rows
 *    0xF5..0xFE map to CLASS_UNICODE code points starting at U+E000; under
 *    SHIFTJIS_CP932 with cp51932_f set, rows 0x79..0x7c are round-tripped
 *    through e2s_conv()/s2e_conv().
 */
3317 nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
3319 if (c2 == JIS_X_0201) {
3322 }else if (c2 == 0x8f){
3326 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
3327 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3328 c1 = (c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC + CLASS_UNICODE;
3331 c2 = (c2 << 8) | (c1 & 0x7f);
3333 #ifdef SHIFTJIS_CP932
3336 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3337 s2e_conv(s2, s1, &c2, &c1);
3344 #endif /* SHIFTJIS_CP932 */
3346 #endif /* X0212_ENABLE */
3347 } else if (c2 == SSO){
3350 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
3353 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
3354 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
3355 c1 = (c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000 + CLASS_UNICODE;
3360 #ifdef SHIFTJIS_CP932
3361 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
3363 if (e2s_conv(c2, c1, &s2, &s1) == 0){
3364 s2e_conv(s2, s1, &c2, &c1);
3371 #endif /* SHIFTJIS_CP932 */
3378 #ifdef UTF8_INPUT_ENABLE
/*
 * w2e_conv - convert one UTF-8 sequence (c2 lead byte, c1/c0 following
 * bytes) to the internal two-byte form, written through p2/p1.
 *
 * Lead bytes 0xC0..0xEF are delegated to unicode_to_jis_common().  Under
 * NUMCHAR_OPTION the code point decoded by ww16_conv() can instead be
 * stored in *p1 tagged with CLASS_UNICODE.
 * NOTE(review): presumably that store is the fallback for a failed table
 * lookup -- confirm the guarding condition.
 */
3380 nkf_char w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3386 }else if (0xc0 <= c2 && c2 <= 0xef) {
3387 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3388 #ifdef NUMCHAR_OPTION
3391 if (p1) *p1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
/*
 * w_iconv - input converter for UTF-8 (c2 lead byte, c1 second byte, c0
 * remaining byte(s); for four-byte sequences c0 carries two bytes, as the
 * 0xc0c0 / 0x8080 masks below show).
 *
 * The lead byte is classified through w_iconv_utf8_1st_byte[] and the
 * following bytes are range-checked per class before conversion.
 * Return values visible here: 0 from the early rejection of a bad second
 * byte, -1 / -2 when c0 is still 0.
 * NOTE(review): -1 / -2 presumably ask the caller for one / two more bytes
 * -- confirm against the caller.
 */
3399 nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * Class of each lead byte 0xC0..0xFF (index = c2 - 0xC0):
 * 20 = 0xC0-0xC1, 21 = 0xC2-0xDF, 30 = 0xE0, 31 = 0xE1-0xEC, 32 = 0xED,
 * 33 = 0xEE-0xEF, 40 = 0xF0, 41 = 0xF1-0xF3, 42 = 0xF4, 43 = 0xF5-0xF7,
 * 50 = 0xF8-0xFB, 60 = 0xFC-0xFD, 70 = 0xFE-0xFF.
 */
3402 static const char w_iconv_utf8_1st_byte[] =
3404 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3405 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
3406 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
3407 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
3409 if (c2 < 0 || 0xff < c2) {
3410 }else if (c2 == 0) { /* 0 : 1 byte*/
3412 } else if ((c2 & 0xc0) == 0x80) { /* 0x80-0xbf : trail byte */
3415 switch (w_iconv_utf8_1st_byte[c2 - 0xC0]) {
3417 if (c1 < 0x80 || 0xBF < c1) return 0;
3420 if (c0 == 0) return -1;
/* second byte restricted to 0xA0..0xBF */
3421 if (c1 < 0xA0 || 0xBF < c1 || (c0 & 0xc0) != 0x80)
3426 if (c0 == 0) return -1;
3427 if ((c1 & 0xc0) != 0x80 || (c0 & 0xc0) != 0x80)
3431 if (c0 == 0) return -1;
/* second byte restricted to 0x80..0x9F */
3432 if (c1 < 0x80 || 0x9F < c1 || (c0 & 0xc0) != 0x80)
3436 if (c0 == 0) return -2;
/* second byte restricted to 0x90..0xBF; c0 holds two continuation bytes */
3437 if (c1 < 0x90 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3441 if (c0 == 0) return -2;
3442 if (c1 < 0x80 || 0xBF < c1 || (c0 & 0xc0c0) != 0x8080)
3446 if (c0 == 0) return -2;
/* second byte restricted to 0x80..0x8F */
3447 if (c1 < 0x80 || 0x8F < c1 || (c0 & 0xc0c0) != 0x8080)
3455 if (c2 == 0 || c2 == EOF){
/* Four-byte sequences bypass the tables and become CLASS_UNICODE values. */
3456 } else if ((c2 & 0xf8) == 0xf0) { /* 4 bytes */
3457 c1 = CLASS_UNICODE | ww16_conv(c2, c1, c0);
3460 ret = w2e_conv(c2, c1, c0, &c2, &c1);
3469 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * w16w_conv - encode the Unicode code point val as UTF-8.
 *
 * *p2 receives the lead byte, *p1 the second byte and *p0 the rest: the
 * third byte for three-byte sequences, or the third and fourth bytes
 * packed as (third << 8) | fourth for code points above 0xFFFF.
 *
 * NOTE(review): the branch for val <= 0x10FFFF builds the lead byte as
 * 0xe0 | (val >> 16) and the second byte from (val >> 12) & 0x3f.  That
 * disagrees with ww16_conv(), which decodes a four-byte lead byte as
 * (c2 & 0x0f) << 18, and with w_oconv(), which emits 0xF0 | (val >> 18).
 * The lead byte here looks like it should be 0xf0 | (val >> 18) -- confirm
 * and fix.
 */
3470 void w16w_conv(nkf_char val, nkf_char *p2, nkf_char *p1, nkf_char *p0)
3477 }else if (val < 0x800){
3478 *p2 = 0xc0 | (val >> 6);
3479 *p1 = 0x80 | (val & 0x3f);
3481 } else if (val <= NKF_INT32_C(0xFFFF)) {
3482 *p2 = 0xe0 | (val >> 12);
3483 *p1 = 0x80 | ((val >> 6) & 0x3f);
3484 *p0 = 0x80 | (val & 0x3f);
3485 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3486 *p2 = 0xe0 | (val >> 16);
3487 *p1 = 0x80 | ((val >> 12) & 0x3f);
/* high byte = 0x80 | bits 6..11 of val, low byte = 0x80 | bits 0..5 */
3488 *p0 = 0x8080 | ((val << 2) & 0x3f00)| (val & 0x3f);
3497 #ifdef UTF8_INPUT_ENABLE
/*
 * ww16_conv - decode one UTF-8 sequence into a code point accumulated in
 * val.
 *
 * c2 is the lead byte and c1 the second byte.  For lead bytes >= 0xf0, c0
 * carries the third and fourth bytes packed into one value; its high byte
 * supplies bits 6..11 of the result.
 */
3498 nkf_char ww16_conv(nkf_char c2, nkf_char c1, nkf_char c0)
3503 } else if (c2 >= 0xf0){
3504 /* c2: 1st, c1: 2nd, c0: 3rd/4th */
3505 val = (c2 & 0x0f) << 18;
3506 val |= (c1 & 0x3f) << 12;
3507 val |= (c0 & 0x3f00) >> 2;
3509 }else if (c2 >= 0xe0){
3510 val = (c2 & 0x0f) << 12;
3511 val |= (c1 & 0x3f) << 6;
3513 }else if (c2 >= 0xc0){
3514 val = (c2 & 0x1f) << 6;
/*
 * w16e_conv - convert the Unicode code point val to the internal two-byte
 * form, written through p2/p1.
 *
 * val is first expanded to UTF-8 bytes by w16w_conv() and then looked up
 * with unicode_to_jis_common().  Under NUMCHAR_OPTION, *p1 can instead be
 * set to CLASS_UNICODE | val.
 * NOTE(review): presumably that is the fallback when the lookup fails --
 * confirm the guarding condition.
 */
3522 nkf_char w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
3524 nkf_char c2, c1, c0;
3531 w16w_conv(val, &c2, &c1, &c0);
3532 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
3533 #ifdef NUMCHAR_OPTION
3536 *p1 = CLASS_UNICODE | val;
3545 #ifdef UTF8_INPUT_ENABLE
/*
 * w_iconv16 - input converter for UTF-16.  c2 and c1 are the high and low
 * bytes of one 16-bit unit (they are recombined as ((c2 & 0xff) << 8) + c1);
 * c0 carries the following 16-bit unit when c2 is a high-surrogate byte.
 *
 * - c2 in 0xD8..0xDB: c0 is checked against 0xDC00..0xDFFF and the pair is
 *   folded into a CLASS_UNICODE code point.
 * - any other c2 with (c2 >> 3) == 27, i.e. 0xDC..0xDF: unpaired surrogate.
 * - otherwise the unit is converted with w16e_conv().
 * A non-zero ret is returned to the caller unchanged.
 */
3546 nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
3549 if ((c2==0 && c1 < 0x80) || c2==EOF) {
3552 }else if (0xD8 <= c2 && c2 <= 0xDB) {
3553 if (c0 < NKF_INT32_C(0xDC00) || NKF_INT32_C(0xDFFF) < c0)
/* 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000 */
3555 c1 = CLASS_UNICODE | ((c2 << 18) + (c1 << 10) + c0 - NKF_INT32_C(0x35FDC00));
3557 }else if ((c2>>3) == 27) { /* unpaired surrogate */
3562 }else ret = w16e_conv(((c2 & 0xff)<<8) + c1, &c2, &c1);
3563 if (ret) return ret;
/*
 * w_iconv32 - input converter for UTF-32; c1 carries the code point.
 *
 * Code points accepted by is_unicode_bmp() are converted with w16e_conv();
 * another branch tags c1 with CLASS_UNICODE instead.  A non-zero ret is
 * returned to the caller unchanged.
 */
3568 nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
3572 if ((c2 == 0 && c1 < 0x80) || c2==EOF) {
3573 } else if (is_unicode_bmp(c1)) {
3574 ret = w16e_conv(c1, &c2, &c1);
3577 c1 = CLASS_UNICODE | c1;
3579 if (ret) return ret;
/*
 * unicode_to_jis_common - look up a UTF-8 sequence (c2 lead byte, c1, c0)
 * in the UTF-8 -> EUC tables and write the result through p2/p1.
 *
 * - Two-byte sequences (c2 < 0xe0) use one of the utf8_to_euc_2bytes*
 *   tables, three-byte sequences one of the utf8_to_euc_3bytes* tables;
 *   the variant is chosen by ms_ucs_map_f (CP932 / MS / CP10001 / default).
 * - When no_best_fit_chars_f is set, selected characters are rejected with
 *   return value 1 before the lookup.  The no_best_fit_chars_table_* arrays
 *   are indexed by c1 & 0x3F; a non-zero entry means "reject".
 * - Under SHIFTJIS_CP932, a successful result that satisfies is_eucg3() is
 *   round-tripped through e2s_conv()/s2e_conv() unless cp932inv_f is set.
 *
 * NOTE(review): the three-byte branch is selected by `c0 < 0xF0`, whereas
 * the two-byte branch tests c2 -- confirm that c0 is the intended operand.
 */
3584 nkf_char unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
3586 const unsigned short *const *pp;
3587 const unsigned short *const *const *ppp;
3588 static const char no_best_fit_chars_table_C2[] =
3589 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3590 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3591 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
3592 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
3593 static const char no_best_fit_chars_table_C2_ms[] =
3594 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3595 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3596 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
3597 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
3598 static const char no_best_fit_chars_table_932_C2[] =
3599 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3600 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3601 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
3602 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
3603 static const char no_best_fit_chars_table_932_C3[] =
3604 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3605 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
3606 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3607 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* ---- two-byte UTF-8 sequences ---- */
3613 }else if(c2 < 0xe0){
3614 if(no_best_fit_chars_f){
3615 if(ms_ucs_map_f == UCS_MAP_CP932){
3618 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
3621 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3624 }else if(!cp932inv_f){
3627 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
3630 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
3633 }else if(ms_ucs_map_f == UCS_MAP_MS){
3634 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
3635 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3653 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
3654 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
3655 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
3657 ret = w_iconv_common(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* ---- three-byte UTF-8 sequences ---- */
3658 }else if(c0 < 0xF0){
3659 if(no_best_fit_chars_f){
3660 if(ms_ucs_map_f == UCS_MAP_CP932){
3661 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
3662 }else if(ms_ucs_map_f == UCS_MAP_MS){
3667 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
3670 if(c0 == 0x92) return 1;
3675 if(c1 == 0x80 || c0 == 0x9C) return 1;
3678 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
3683 if(c0 == 0x94) return 1;
3686 if(c0 == 0xBB) return 1;
3696 if(c0 == 0x95) return 1;
3699 if(c0 == 0xA5) return 1;
3706 if(c0 == 0x8D) return 1;
3709 if(c0 == 0x9E && !cp932inv_f) return 1;
3712 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
3720 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
3721 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
3722 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
/* The lead byte selects the second-level table: index c2 - 0xE0. */
3724 ret = w_iconv_common(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
3726 #ifdef SHIFTJIS_CP932
3727 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
3729 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
3730 s2e_conv(s2, s1, p2, p1);
/*
 * w_iconv_common - two-level table lookup shared by the UTF-8 input paths.
 *
 * pp is a table of psize row pointers; c1 selects the row and c0 the
 * entry, with the result delivered through p2/p1.
 * Returns 1 (no mapping) when pp is NULL, when either index is out of
 * range, when the selected row is NULL, or when the entry is 0.
 * When no_cp932ext_f is set, values in row 0x2D (NEC special characters)
 * and values above 0xF300 (IBM extended characters) are filtered as well.
 * A resulting c2 of SO is rewritten to JIS_X_0201.
 */
3739 nkf_char w_iconv_common(nkf_char c1, nkf_char c0, const unsigned short *const *pp, nkf_char psize, nkf_char *p2, nkf_char *p1)
3742 const unsigned short *p;
3745 if (pp == 0) return 1;
3748 if (c1 < 0 || psize <= c1) return 1;
3750 if (p == 0) return 1;
3753 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
3755 if (val == 0) return 1;
3756 if (no_cp932ext_f && (
3757 (val>>8) == 0x2D || /* NEC special characters */
3758 val > NKF_INT32_C(0xF300) /* IBM extended characters */
3766 if (c2 == SO) c2 = JIS_X_0201;
/*
 * nkf_each_char_to_hex - feed the hexadecimal digits of c to the callback.
 *
 * f is called as f(0, digit), where digit is bin2hex() of c shifted right
 * by the current value of shift.
 */
3773 void nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
3780 (*f)(0, bin2hex(c>>shift));
/*
 * encode_fallback_html - fallback output for an unconvertible code point:
 * the decimal digits of c are sent through oconv, most significant first.
 *
 * Each digit is emitted as 0x30 + digit with c2 == 0.  The leading digits
 * visible here are emitted only when c is large enough to need them.
 * NOTE(review): presumably wrapped in an HTML numeric character reference
 * -- confirm the surrounding output.
 */
3790 void encode_fallback_html(nkf_char c)
3795 if(c >= NKF_INT32_C(1000000))
3796 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
3797 if(c >= NKF_INT32_C(100000))
3798 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
3800 (*oconv)(0, 0x30+(c/10000 )%10);
3802 (*oconv)(0, 0x30+(c/1000 )%10);
3804 (*oconv)(0, 0x30+(c/100 )%10);
3806 (*oconv)(0, 0x30+(c/10 )%10);
3808 (*oconv)(0, 0x30+ c %10);
/*
 * encode_fallback_xml - fallback output for an unconvertible code point:
 * the hexadecimal digits of c are sent through oconv by
 * nkf_each_char_to_hex().
 */
3813 void encode_fallback_xml(nkf_char c)
3818 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_java - fallback output for an unconvertible code point
 * as hexadecimal digits sent through oconv.
 *
 * Code points rejected by is_unicode_bmp() get two extra leading digits
 * (bits 20 and up, then bits 16..19); the four digits for bits 12..15,
 * 8..11, 4..7 and 0..3 follow.
 */
3823 void encode_fallback_java(nkf_char c)
3827 if(!is_unicode_bmp(c)){
3831 (*oconv)(0, bin2hex(c>>20));
3832 (*oconv)(0, bin2hex(c>>16));
3836 (*oconv)(0, bin2hex(c>>12));
3837 (*oconv)(0, bin2hex(c>> 8));
3838 (*oconv)(0, bin2hex(c>> 4));
3839 (*oconv)(0, bin2hex(c ));
/*
 * encode_fallback_perl - fallback output for an unconvertible code point:
 * the hexadecimal digits of c are sent through oconv by
 * nkf_each_char_to_hex().
 */
3843 void encode_fallback_perl(nkf_char c)
3848 nkf_each_char_to_hex(oconv, c);
/*
 * encode_fallback_subchar - fallback that ignores the incoming code point
 * and emits the configured substitution character instead.
 *
 * c is overwritten with unicode_subchar, which is then passed to oconv
 * split into its bits 8..15 (as c2) and bits 0..7 (as c1).
 */
3853 void encode_fallback_subchar(nkf_char c)
3855 c = unicode_subchar;
3856 (*oconv)((c>>8)&0xFF, c&0xFF);
3861 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv - map an internal two-byte code (c2, c1) to a Unicode value by
 * table lookup.
 *
 * - c2 == JIS_X_0201 uses euc_to_utf8_1byte (UCS_MAP_CP10001 has its own
 *   special handling first).
 * - codes satisfying is_eucg3(c2) use x0212_to_utf8_2bytes; the pair
 *   0x8F22 / 0x43 is special-cased under UCS_MAP_ASCII.
 * - all other codes use euc_to_utf8_2bytes, euc_to_utf8_2bytes_mac or
 *   euc_to_utf8_2bytes_ms, selected by ms_ucs_map_f.
 * Row and cell indices are formed as (byte & 0x7f) - 0x21 and are
 * bounds-checked before use.
 */
3862 nkf_char e2w_conv(nkf_char c2, nkf_char c1)
3864 const unsigned short *p;
3866 if (c2 == JIS_X_0201) {
3867 if (ms_ucs_map_f == UCS_MAP_CP10001) {
3875 p = euc_to_utf8_1byte;
3877 } else if (is_eucg3(c2)){
3878 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
3881 c2 = (c2&0x7f) - 0x21;
3882 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3883 p = x0212_to_utf8_2bytes[c2];
3889 c2 = (c2&0x7f) - 0x21;
3890 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
3892 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
3893 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
3894 euc_to_utf8_2bytes_ms[c2];
3899 c1 = (c1 & 0x7f) - 0x21;
3900 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w_oconv - output converter producing UTF-8 through o_putc.
 *
 * - output_bom_f is cleared once the start-of-output handling has run.
 * - Under NUMCHAR_OPTION, a CLASS_UNICODE capsule (c2 == 0 and
 *   is_unicode_capsule(c1)) is unwrapped with VALUE_MASK and encoded
 *   directly as a 2-, 3- or 4-byte UTF-8 sequence.
 * - c2 == ISO_8859_1 emits c1 with the high bit set.
 * - Other codes are mapped with e2w_conv() and expanded with w16w_conv();
 *   the last byte is written only when c0 is non-zero.
 * output_mode is updated to ASCII or UTF_8 according to what was emitted.
 */
3905 void w_oconv(nkf_char c2, nkf_char c1)
3911 output_bom_f = FALSE;
3922 #ifdef NUMCHAR_OPTION
3923 if (c2 == 0 && is_unicode_capsule(c1)){
3924 val = c1 & VALUE_MASK;
3927 }else if (val < 0x800){
3928 (*o_putc)(0xC0 | (val >> 6));
3929 (*o_putc)(0x80 | (val & 0x3f));
3930 } else if (val <= NKF_INT32_C(0xFFFF)) {
3931 (*o_putc)(0xE0 | (val >> 12));
3932 (*o_putc)(0x80 | ((val >> 6) & 0x3f));
3933 (*o_putc)(0x80 | (val & 0x3f));
3934 } else if (val <= NKF_INT32_C(0x10FFFF)) {
3935 (*o_putc)(0xF0 | ( val>>18));
3936 (*o_putc)(0x80 | ((val>>12) & 0x3f));
3937 (*o_putc)(0x80 | ((val>> 6) & 0x3f));
3938 (*o_putc)(0x80 | ( val & 0x3f));
3945 output_mode = ASCII;
3947 } else if (c2 == ISO_8859_1) {
3948 output_mode = UTF_8;
3949 (*o_putc)(c1 | 0x080);
3951 output_mode = UTF_8;
3952 val = e2w_conv(c2, c1);
3954 w16w_conv(val, &c2, &c1, &c0);
3958 if (c0) (*o_putc)(c0);
/*
 * w_oconv16 - output converter producing UTF-16 through o_putc, in the
 * byte order selected by output_endian.
 *
 * - Start-of-output handling writes bytes including 0xFF ('\377') in an
 *   order that depends on output_endian, and clears output_bom_f.
 * - Under NUMCHAR_OPTION, a CLASS_UNICODE capsule is written directly.
 *   Values accepted by is_unicode_bmp() become one 16-bit unit; values up
 *   to UNICODE_MAX become a surrogate pair, each unit written in
 *   output_endian order.
 * - Other codes are mapped with e2w_conv() and split into two bytes.
 */
3964 void w_oconv16(nkf_char c2, nkf_char c1)
3967 output_bom_f = FALSE;
3968 if (output_endian == ENDIAN_LITTLE){
3969 (*o_putc)((unsigned char)'\377');
3973 (*o_putc)((unsigned char)'\377');
3982 if (c2 == ISO_8859_1) {
3985 #ifdef NUMCHAR_OPTION
3986 } else if (c2 == 0 && is_unicode_capsule(c1)) {
3987 if (is_unicode_bmp(c1)) {
3988 c2 = (c1 >> 8) & 0xff;
3992 if (c1 <= UNICODE_MAX) {
/* 0xD7C0 == 0xD800 - (0x10000 >> 10) */
3993 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
3994 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
3995 if (output_endian == ENDIAN_LITTLE){
3996 (*o_putc)(c2 & 0xff);
3997 (*o_putc)((c2 >> 8) & 0xff);
3998 (*o_putc)(c1 & 0xff);
3999 (*o_putc)((c1 >> 8) & 0xff);
4001 (*o_putc)((c2 >> 8) & 0xff);
4002 (*o_putc)(c2 & 0xff);
4003 (*o_putc)((c1 >> 8) & 0xff);
4004 (*o_putc)(c1 & 0xff);
4011 nkf_char val = e2w_conv(c2, c1);
4012 c2 = (val >> 8) & 0xff;
4016 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32 - output converter producing UTF-32 through o_putc, in the
 * byte order selected by output_endian.
 *
 * Start-of-output handling writes bytes including 0xFF ('\377') in an
 * order that depends on output_endian, and clears output_bom_f.  Codes
 * that are neither ISO_8859_1 nor (under NUMCHAR_OPTION) a CLASS_UNICODE
 * capsule are mapped with e2w_conv().  The low three bytes of c1 are then
 * written least- or most-significant first according to output_endian.
 */
4025 void w_oconv32(nkf_char c2, nkf_char c1)
4028 output_bom_f = FALSE;
4029 if (output_endian == ENDIAN_LITTLE){
4030 (*o_putc)((unsigned char)'\377');
4038 (*o_putc)((unsigned char)'\377');
4047 if (c2 == ISO_8859_1) {
4049 #ifdef NUMCHAR_OPTION
4050 } else if (c2 == 0 && is_unicode_capsule(c1)) {
4054 c1 = e2w_conv(c2, c1);
4057 if (output_endian == ENDIAN_LITTLE){
4058 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
4059 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4060 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4064 (*o_putc)((c1 & NKF_INT32_C(0x00FF0000)) >> 16);
4065 (*o_putc)((c1 & NKF_INT32_C(0x0000FF00)) >> 8);
4066 (*o_putc)( c1 & NKF_INT32_C(0x000000FF));
/*
 * e_oconv - output converter producing EUC-JP through o_putc.
 *
 * - Under NUMCHAR_OPTION, a CLASS_UNICODE capsule is first run through
 *   w16e_conv().  If it is still a capsule afterwards, code points
 *   U+E000..U+E757 are mapped arithmetically into user-defined rows when
 *   x0212_f is set; otherwise encode_fallback, if set, is invoked.
 * - c2 == 0: ASCII.  c2 == JIS_X_0201: SSO followed by c1 | 0x80.
 *   c2 == ISO_8859_1: c1 | 0x80 with output_mode set to ISO_8859_1.
 * - Codes satisfying is_eucg3(c2) are, under SHIFTJIS_CP932, round-tripped
 *   through e2s_conv()/s2e_conv() before being written.
 * - Remaining two-byte codes must pass nkf_isgraph() on both bytes;
 *   otherwise set_iconv(FALSE, 0) is called and the character is dropped.
 */
4071 void e_oconv(nkf_char c2, nkf_char c1)
4073 #ifdef NUMCHAR_OPTION
4074 if (c2 == 0 && is_unicode_capsule(c1)){
4075 w16e_conv(c1, &c2, &c1);
4076 if (c2 == 0 && is_unicode_capsule(c1)){
4077 c2 = c1 & VALUE_MASK;
4078 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
4082 c2 += c2 < 10 ? 0x75 : 0x8FEB;
4083 c1 = 0x21 + c1 % 94;
4086 (*o_putc)((c2 & 0x7f) | 0x080);
4087 (*o_putc)(c1 | 0x080);
4089 (*o_putc)((c2 & 0x7f) | 0x080);
4090 (*o_putc)(c1 | 0x080);
4094 if (encode_fallback) (*encode_fallback)(c1);
4103 } else if (c2 == 0) {
4104 output_mode = ASCII;
4106 } else if (c2 == JIS_X_0201) {
4107 output_mode = EUC_JP;
4108 (*o_putc)(SSO); (*o_putc)(c1|0x80);
4109 } else if (c2 == ISO_8859_1) {
4110 output_mode = ISO_8859_1;
4111 (*o_putc)(c1 | 0x080);
4113 } else if (is_eucg3(c2)){
4114 output_mode = EUC_JP;
4115 #ifdef SHIFTJIS_CP932
4118 if (e2s_conv(c2, c1, &s2, &s1) == 0){
4119 s2e_conv(s2, s1, &c2, &c1);
4124 output_mode = ASCII;
4126 }else if (is_eucg3(c2)){
4129 (*o_putc)((c2 & 0x7f) | 0x080);
4130 (*o_putc)(c1 | 0x080);
4133 (*o_putc)((c2 & 0x7f) | 0x080);
4134 (*o_putc)(c1 | 0x080);
4138 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
4139 set_iconv(FALSE, 0);
4140 return; /* too late to rescue this char */
4142 output_mode = EUC_JP;
4143 (*o_putc)(c2 | 0x080);
4144 (*o_putc)(c1 | 0x080);
/*
 * x0212_shift - relocate a row number in 0x75..0x7f to an extended row
 * number, for use before the Shift_JIS conversion in e2s_conv().
 *
 * Two cases apply the same range test with different targets: one moves
 * the range to start at 0x109, the other to start at 0x113.
 * NOTE(review): confirm which condition selects each case.
 */
4149 nkf_char x0212_shift(nkf_char c)
4154 if (0x75 <= c && c <= 0x7f){
4155 ret = c + (0x109 - 0x75);
4158 if (0x75 <= c && c <= 0x7f){
4159 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift - map an extended row number back into the 0x75.. range.
 *
 * Rows 0x7f..0x88 are moved down to start at 0x75.  Rows 0x89..0x92 are
 * likewise moved down to start at 0x75, then tagged with
 * PREFIX_EUCG3 | 0x80.
 */
4166 nkf_char x0212_unshift(nkf_char c)
4169 if (0x7f <= c && c <= 0x88){
4170 ret = c + (0x75 - 0x7f);
4171 }else if (0x89 <= c && c <= 0x92){
4172 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
4176 #endif /* X0212_ENABLE */
/*
 * e2s_conv - convert an internal two-byte code (c2, c1) to Shift_JIS.
 * The lead byte is stored through p2 and the trail byte through p1; either
 * pointer may be NULL, in which case that store is skipped.
 *
 * - Two arithmetic branches handle row numbers (ndx) 0x21..0x2F and
 *   0x6E..0x7E.
 * - Other rows passing nkf_isgraph(ndx) are looked up in x0212_shiftjis[].
 * - c2 is passed through x0212_shift() before the common conversion.
 * - Common conversion: returns 1 when c2 exceeds 0x7F; otherwise the lead
 *   byte is ((c2 - 1) >> 1) plus 0x71 (rows up to 0x5e) or 0xb1, and the
 *   trail byte offset depends on whether the row is odd or even.
 */
4178 nkf_char e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
4184 if((0x21 <= ndx && ndx <= 0x2F)){
4185 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
4186 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4188 }else if(0x6E <= ndx && ndx <= 0x7E){
4189 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
4190 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
4196 else if(nkf_isgraph(ndx)){
4198 const unsigned short *ptr;
4199 ptr = x0212_shiftjis[ndx - 0x21];
4201 val = ptr[(c1 & 0x7f) - 0x21];
4210 c2 = x0212_shift(c2);
4212 #endif /* X0212_ENABLE */
4214 if(0x7F < c2) return 1;
4215 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
/* Odd rows: add 0x1f (c1 < 0x60) or 0x20; even rows: add 0x7e. */
4216 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s_oconv - output converter producing Shift_JIS through o_putc.
 *
 * - Under NUMCHAR_OPTION, a CLASS_UNICODE capsule is first run through
 *   w16e_conv().  If it is still a capsule afterwards, code points
 *   U+E000..U+E757 are mapped arithmetically (188 cells per lead byte,
 *   base lead byte 0xF0 with cp932inv_f, else 0xEB) unless x0213_f is set;
 *   otherwise encode_fallback, if set, is invoked.
 * - c2 == 0: ASCII.  c2 == JIS_X_0201 sets output_mode to SHIFT_JIS.
 *   c2 == ISO_8859_1 writes c1 | 0x80.
 * - Codes satisfying is_eucg3(c2) are converted with e2s_conv().
 * - Remaining two-byte codes must pass nkf_isprint() on both bytes;
 *   otherwise set_iconv(FALSE, 0) is called and the character is dropped.
 *   They are converted with e2s_conv() and, under SHIFTJIS_CP932,
 *   optionally remapped through cp932inv[].
 * - If prefix_table has an entry for the trail byte, that prefix byte is
 *   written as well.
 */
4220 void s_oconv(nkf_char c2, nkf_char c1)
4222 #ifdef NUMCHAR_OPTION
4223 if (c2 == 0 && is_unicode_capsule(c1)){
4224 w16e_conv(c1, &c2, &c1);
4225 if (c2 == 0 && is_unicode_capsule(c1)){
4226 c2 = c1 & VALUE_MASK;
4227 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
4230 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
/* Offsets at or above 0x3f are bumped by one so the trail byte skips 0x7F. */
4232 c1 += 0x40 + (c1 > 0x3e);
4237 if(encode_fallback)(*encode_fallback)(c1);
4246 } else if (c2 == 0) {
4247 output_mode = ASCII;
4249 } else if (c2 == JIS_X_0201) {
4250 output_mode = SHIFT_JIS;
4252 } else if (c2 == ISO_8859_1) {
4253 output_mode = ISO_8859_1;
4254 (*o_putc)(c1 | 0x080);
4256 } else if (is_eucg3(c2)){
4257 output_mode = SHIFT_JIS;
4258 if (e2s_conv(c2, c1, &c2, &c1) == 0){
4264 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
4265 set_iconv(FALSE, 0);
4266 return; /* too late to rescue this char */
4268 output_mode = SHIFT_JIS;
4269 e2s_conv(c2, c1, &c2, &c1);
4271 #ifdef SHIFTJIS_CP932
4273 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
4274 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
4280 #endif /* SHIFTJIS_CP932 */
4283 if (prefix_table[(unsigned char)c1]){
4284 (*o_putc)(prefix_table[(unsigned char)c1]);
/*
 * j_oconv - output converter producing 7-bit JIS (ISO-2022-JP style)
 * through o_putc, tracking the current designation in output_mode.
 *
 * - Under NUMCHAR_OPTION, a CLASS_UNICODE capsule is first run through
 *   w16e_conv().  If it is still a capsule afterwards, code points
 *   U+E000..U+E757 are mapped to rows starting at 0x7F (94 cells per row)
 *   when ms_ucs_map_f is set; otherwise encode_fallback, if set, is
 *   invoked.
 * - For each character class (G3 codes, JIS_X_0201, ISO_8859_1, ASCII,
 *   two-byte kanji) output_mode is compared with the mode required and
 *   updated when it differs; ascii_intro and kanji_intro are written as
 *   part of the switch back to ASCII and into JIS_X_0208 respectively.
 * - Two-byte codes outside the accepted row/cell range are dropped.
 */
4290 void j_oconv(nkf_char c2, nkf_char c1)
4292 #ifdef NUMCHAR_OPTION
4293 if (c2 == 0 && is_unicode_capsule(c1)){
4294 w16e_conv(c1, &c2, &c1);
4295 if (c2 == 0 && is_unicode_capsule(c1)){
4296 c2 = c1 & VALUE_MASK;
4297 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
4300 c2 = 0x7F + c1 / 94;
4301 c1 = 0x21 + c1 % 94;
4303 if (encode_fallback) (*encode_fallback)(c1);
4310 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4313 (*o_putc)(ascii_intro);
4314 output_mode = ASCII;
4318 } else if (is_eucg3(c2)){
4320 if(output_mode!=JIS_X_0213_2){
4321 output_mode = JIS_X_0213_2;
4328 if(output_mode!=JIS_X_0212){
4329 output_mode = JIS_X_0212;
4336 (*o_putc)(c2 & 0x7f);
4339 } else if (c2==JIS_X_0201) {
4340 if (output_mode!=JIS_X_0201) {
4341 output_mode = JIS_X_0201;
4347 } else if (c2==ISO_8859_1) {
4348 /* iso8859 introduction, or 8th bit on */
4349 /* Can we convert in 7bit form using ESC-'-'-A ?
4351 output_mode = ISO_8859_1;
4353 } else if (c2 == 0) {
4354 if (output_mode !=ASCII && output_mode!=ISO_8859_1) {
4357 (*o_putc)(ascii_intro);
4358 output_mode = ASCII;
4363 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
4364 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
4366 if (output_mode!=JIS_X_0213_1) {
4367 output_mode = JIS_X_0213_1;
4373 }else if (output_mode != JIS_X_0208) {
4374 output_mode = JIS_X_0208;
4377 (*o_putc)(kanji_intro);
/*
 * base64_conv - output filter stage: calls mime_prechar(c2, c1) and then
 * forwards the character unchanged to the next stage, o_base64conv.
 */
4384 void base64_conv(nkf_char c2, nkf_char c1)
4386 mime_prechar(c2, c1);
4387 (*o_base64conv)(c2,c1);
/*
 * State shared by broken_getc() and broken_ungetc().
 * broken_buf is a small pushback stack and broken_counter the number of
 * characters currently in it (the top is broken_buf[broken_counter - 1]).
 * broken_last is compared against ESC by broken_getc().
 * NOTE(review): presumably broken_last holds the previously returned
 * character -- confirm.
 */
4391 static nkf_char broken_buf[3];
4392 static int broken_counter = 0;
4393 static int broken_last = 0;
/*
 * broken_getc - input filter that recognises JIS designation sequences
 * even when their leading ESC is absent.
 *
 * Characters waiting in broken_buf are returned first (last in, first out).
 * Otherwise two patterns are checked, both only when the previous
 * character (broken_last) was not ESC:
 *  - '$' followed by '@' or 'B' while input_mode is ASCII or JIS_X_0201;
 *  - '(' followed by 'J' or 'B' while input_mode is JIS_X_0208 or
 *    JIS_X_0201.
 * On a match the two characters are pushed back into broken_buf.
 * NOTE(review): presumably an ESC is then returned in their place --
 * confirm.
 */
4394 nkf_char broken_getc(FILE *f)
4398 if (broken_counter>0) {
4399 return broken_buf[--broken_counter];
4402 if (c=='$' && broken_last != ESC
4403 && (input_mode==ASCII || input_mode==JIS_X_0201)) {
4406 if (c1=='@'|| c1=='B') {
4407 broken_buf[0]=c1; broken_buf[1]=c;
4414 } else if (c=='(' && broken_last != ESC
4415 && (input_mode==JIS_X_0208 || input_mode==JIS_X_0201)) { /* ) */
4418 if (c1=='J'|| c1=='B') {
4419 broken_buf[0]=c1; broken_buf[1]=c;
/*
 * broken_ungetc - push c back so that broken_getc() returns it next.
 *
 * The character is stored only while fewer than two characters are
 * pending; f is not used for the store.
 * NOTE(review): broken_buf has three slots but the guard admits only two
 * here -- confirm this is deliberate.
 */
4432 nkf_char broken_ungetc(nkf_char c, FILE *f)
4434 if (broken_counter<2)
4435 broken_buf[broken_counter++]=c;
/*
 * nl_conv - output filter that normalises newlines and, when guess_f is
 * set, records which newline convention the input uses.
 *
 * Detection (input_newline): 0 means undecided; it becomes LF, CRLF or CR
 * on the first newline seen, and EOF as soon as a newline of a different
 * kind follows.
 * Conversion: a CR is remembered in prev_cr rather than forwarded.  When a
 * newline is complete, CR is written unless nlmode_f == LF and LF is
 * written unless nlmode_f == CR.  All other characters are forwarded to
 * o_nlconv unchanged.
 */
4439 void nl_conv(nkf_char c2, nkf_char c1)
4441 if (guess_f && input_newline != EOF) {
4442 if (c2 == 0 && c1 == LF) {
4443 if (!input_newline) input_newline = prev_cr ? CRLF : LF;
4444 else if (input_newline != (prev_cr ? CRLF : LF)) input_newline = EOF;
4445 } else if (c2 == 0 && c1 == CR && input_newline == LF) input_newline = EOF;
4447 else if (!input_newline) input_newline = CR;
4448 else if (input_newline != CR) input_newline = EOF;
4450 if (prev_cr || (c2 == 0 && c1 == LF)) {
4452 if (nlmode_f != LF) (*o_nlconv)(0, CR);
4453 if (nlmode_f != CR) (*o_nlconv)(0, LF);
4455 if (c2 == 0 && c1 == CR) prev_cr = CR;
4456 else if (c2 != 0 || c1 != LF) (*o_nlconv)(c2, c1);
4460 Return value of fold_conv()
4462 LF add newline and output char
4463 CR add newline and output nothing
4466 1 (or else) normal output
4468 fold state in prev (previous character)
4470 >0x80 Japanese (X0208/X0201)
4475 This fold algorithm does not preserve leading spaces in a line.
4476 This is the main difference from fmt.
/*
 * char_size - column width used by fold_conv(): 2 when c2 is non-zero,
 * otherwise 1.  c1 is accepted but not used.
 * NOTE(review): c2 is not parenthesised in the expansion, so only simple
 * expressions should be passed.
 */
4479 #define char_size(c2,c1) (c2?2:1)
/*
 * fold_conv - line-folding output filter.
 *
 * For every character it computes fold_state (0 = drop, LF = newline then
 * the character, CR = newline only, SP = a single space, 1 = plain output)
 * and then acts on it in the final switch, writing newlines with
 * OCONV_NEWLINE on o_fconv.
 *
 * State: f_line is the current column, advanced by char_size(); f_prev is
 * the previous character, with bit 0x80 set when it was a two-byte
 * character.  A line is folded once f_line exceeds fold_len, except that
 * characters matched by the kinsoku tests below may overhang by up to
 * fold_margin columns.  fold_preserve_f selects whether input newlines are
 * kept.
 *
 * The short comments consisting of escape sequences appear to be the
 * punctuation characters being tested, stored in ISO-2022-JP form.
 *
 * NOTE(review): in `if (c2 || c2==JIS_X_0201)` the second operand can only
 * matter if JIS_X_0201 is 0 -- confirm, otherwise it is redundant.
 */
4481 void fold_conv(nkf_char c2, nkf_char c1)
4484 nkf_char fold_state;
4486 if (c1== CR && !fold_preserve_f) {
4487 fold_state=0; /* ignore cr */
4488 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
4490 fold_state=0; /* ignore cr */
4491 } else if (c1== BS) {
4492 if (f_line>0) f_line--;
4494 } else if (c2==EOF && f_line != 0) { /* close open last line */
4496 } else if ((c1==LF && !fold_preserve_f)
4497 || ((c1==CR||(c1==LF&&f_prev!=CR))
4498 && fold_preserve_f)) {
4500 if (fold_preserve_f) {
4504 } else if ((f_prev == c1 && !fold_preserve_f)
4505 || (f_prev == LF && fold_preserve_f)
4506 ) { /* duplicate newline */
4509 fold_state = LF; /* output two newline */
4515 if (f_prev&0x80) { /* Japanese? */
4517 fold_state = 0; /* ignore given single newline */
4518 } else if (f_prev==SP) {
4522 if (++f_line<=fold_len)
4526 fold_state = CR; /* fold and output nothing */
4530 } else if (c1=='\f') {
4533 fold_state = LF; /* output newline and clear */
4534 } else if ( (c2==0 && c1==SP)||
4535 (c2==0 && c1==TAB)||
4536 (c2=='!'&& c1=='!')) {
4537 /* X0208 kankaku or ascii space */
4539 fold_state = 0; /* remove duplicate spaces */
4542 if (++f_line<=fold_len)
4543 fold_state = SP; /* output ASCII space only */
4545 f_prev = SP; f_line = 0;
4546 fold_state = CR; /* fold and output nothing */
4550 prev0 = f_prev; /* we still need this one... , but almost done */
4552 if (c2 || c2==JIS_X_0201)
4553 f_prev |= 0x80; /* this is Japanese */
4554 f_line += char_size(c2,c1);
4555 if (f_line<=fold_len) { /* normal case */
4558 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
4559 f_line = char_size(c2,c1);
4560 fold_state = LF; /* We can't wait, do fold now */
4561 } else if (c2==JIS_X_0201) {
4562 /* simple kinsoku rules return 1 means no folding */
4563 if (c1==(0xde&0x7f)) fold_state = 1; /*
\e$B!+
\e(B*/
4564 else if (c1==(0xdf&0x7f)) fold_state = 1; /*
\e$B!,
\e(B*/
4565 else if (c1==(0xa4&0x7f)) fold_state = 1; /*
\e$B!#
\e(B*/
4566 else if (c1==(0xa3&0x7f)) fold_state = 1; /*
\e$B!$
\e(B*/
4567 else if (c1==(0xa1&0x7f)) fold_state = 1; /*
\e$B!W
\e(B*/
4568 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
4569 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
4571 fold_state = LF;/* add one new f_line before this character */
4574 fold_state = LF;/* add one new f_line before this character */
4577 /* kinsoku point in ASCII */
4578 if ( c1==')'|| /* { [ ( */
4589 /* just after special */
4590 } else if (!is_alnum(prev0)) {
4591 f_line = char_size(c2,c1);
4593 } else if ((prev0==SP) || /* ignored new f_line */
4594 (prev0==LF)|| /* ignored new f_line */
4595 (prev0&0x80)) { /* X0208 - ASCII */
4596 f_line = char_size(c2,c1);
4597 fold_state = LF;/* add one new f_line before this character */
4599 fold_state = 1; /* default no fold in ASCII */
4603 if (c1=='"') fold_state = 1; /*
\e$B!"
\e(B */
4604 else if (c1=='#') fold_state = 1; /*
\e$B!#
\e(B */
4605 else if (c1=='W') fold_state = 1; /*
\e$B!W
\e(B */
4606 else if (c1=='K') fold_state = 1; /*
\e$B!K
\e(B */
4607 else if (c1=='$') fold_state = 1; /*
\e$B!$
\e(B */
4608 else if (c1=='%') fold_state = 1; /*
\e$B!%
\e(B */
4609 else if (c1=='\'') fold_state = 1; /*
\e$B!\
\e(B */
4610 else if (c1=='(') fold_state = 1; /*
\e$B!(
\e(B */
4611 else if (c1==')') fold_state = 1; /*
\e$B!)
\e(B */
4612 else if (c1=='*') fold_state = 1; /*
\e$B!*
\e(B */
4613 else if (c1=='+') fold_state = 1; /*
\e$B!+
\e(B */
4614 else if (c1==',') fold_state = 1; /*
\e$B!,
\e(B */
4615 /* default no fold in kinsoku */
4618 f_line = char_size(c2,c1);
4619 /* add one new f_line before this character */
4622 f_line = char_size(c2,c1);
4624 /* add one new f_line before this character */
4629 /* terminator process */
4630 switch(fold_state) {
4632 OCONV_NEWLINE((*o_fconv));
4638 OCONV_NEWLINE((*o_fconv));
/*
 * z_conv - width-conversion output filter between JIS X 0201 (half-width)
 * and JIS X 0208 (full-width) characters, forwarding to o_zconv.
 *
 * z_prev2 / z_prev1 hold one pending JIS X 0201 character.  When the next
 * character is the voiced mark (0xde & 0x7f) or the semi-voiced mark
 * (0xdf & 0x7f), the combined form is taken from dv[] or ev[]; otherwise
 * the pending character is emitted through cv[].  A new JIS X 0201
 * character is held back only if dv[] or ev[] has an entry for it.
 *
 * alpha_f bits seen here: bit 0 gates the row 0x23 (alphabet) and fv[]
 * paths, bit 2 gates a further branch after the row 0x21 test, bit 3
 * replaces '>', '<', '"' and '&' by an entity string.  Row 0x25
 * (katakana) is converted through fullwidth_to_halfwidth[], whose entries
 * hold the half-width character in the high byte and an optional second
 * character in the low byte.
 */
4649 nkf_char z_prev2=0,z_prev1=0;
4651 void z_conv(nkf_char c2, nkf_char c1)
4654 /* if (c2) c1 &= 0x7f; assertion */
4656 if (c2 == JIS_X_0201 && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4662 if (z_prev2 == JIS_X_0201) {
4663 if (c2 == JIS_X_0201) {
4664 if (c1 == (0xde&0x7f)) { /*
\e$BByE@
\e(B */
4666 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4668 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /*
\e$BH>ByE@
\e(B */
4670 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4675 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4677 if (c2 == JIS_X_0201) {
4678 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
4679 /* wait for
\e$BByE@
\e(B or
\e$BH>ByE@
\e(B */
4684 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4695 if (alpha_f&1 && c2 == 0x23) {
4696 /* JISX0208 Alphabet */
4698 } else if (c2 == 0x21) {
4699 /* JISX0208 Kigou */
4704 } else if (alpha_f&4) {
4709 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4715 if (alpha_f&8 && c2 == 0) {
4719 case '>': entity = ">"; break;
4720 case '<': entity = "<"; break;
4721 case '\"': entity = """; break;
4722 case '&': entity = "&"; break;
4725 while (*entity) (*o_zconv)(0, *entity++);
4731 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4736 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4740 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4744 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4748 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4752 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4756 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4760 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4764 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4769 (*o_zconv)(JIS_X_0201, c);
4772 } else if (c2 == 0x25) {
4773 /* JISX0208 Katakana */
4774 static const int fullwidth_to_halfwidth[] =
4776 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4777 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4778 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4779 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4780 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4781 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4782 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4783 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4784 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4785 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4786 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
4787 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
4789 if (fullwidth_to_halfwidth[c1-0x20]){
4790 c2 = fullwidth_to_halfwidth[c1-0x20];
4791 (*o_zconv)(JIS_X_0201, c2>>8);
4793 (*o_zconv)(JIS_X_0201, c2&0xFF);
/*
 * rot13(c) / rot47(c) - character rotation macros used by rot_conv().
 *
 * rot13 adds 13 to letters up to 'M' / 'm' and subtracts 13 from letters
 * up to 'Z' / 'z'.  rot47 adds 47 to characters up to 'O' and subtracts 47
 * from characters up to '~'.
 * NOTE(review): the argument is expanded several times and is not
 * parenthesised, so it must be a side-effect-free simple expression.
 */
4803 #define rot13(c) ( \
4805 (c <= 'M') ? (c + 13): \
4806 (c <= 'Z') ? (c - 13): \
4808 (c <= 'm') ? (c + 13): \
4809 (c <= 'z') ? (c - 13): \
4813 #define rot47(c) ( \
4815 ( c <= 'O') ? (c + 47) : \
4816 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv - output filter applying character rotation, then forwarding
 * the character to o_rot_conv.
 *
 * Single-byte classes (c2 == 0, JIS_X_0201, ISO_8859_1) take one branch;
 * all other characters take the other.
 * NOTE(review): presumably the single-byte branch uses rot13 and the
 * two-byte branch rot47 on each byte -- confirm.
 */
4820 void rot_conv(nkf_char c2, nkf_char c1)
4822 if (c2==0 || c2==JIS_X_0201 || c2==ISO_8859_1) {
4828 (*o_rot_conv)(c2,c1);
/*
 * hira_conv - output filter that converts between the two JIS X 0208 kana
 * rows, forwarding the result to o_hira_conv.
 *
 * One direction handles cells 0x21..0x73, cell 0x74 (emitted as
 * CLASS_UNICODE | 0x3094 when the output encoding is a Unicode one), and
 * row 0x21 cells 0x33 / 0x34.  The other direction handles
 * CLASS_UNICODE | 0x3094, row 0x24 cells 0x21..0x73, and row 0x21 cells
 * 0x35 / 0x36.  Characters matching no case are forwarded unchanged.
 * NOTE(review): presumably the first group is katakana -> hiragana and the
 * second hiragana -> katakana, each gated by an option flag -- confirm.
 */
4831 void hira_conv(nkf_char c2, nkf_char c1)
4835 if (0x20 < c1 && c1 < 0x74) {
4837 (*o_hira_conv)(c2,c1);
4839 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
4841 c1 = CLASS_UNICODE | 0x3094;
4842 (*o_hira_conv)(c2,c1);
4845 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4847 (*o_hira_conv)(c2,c1);
4852 if (c2 == 0 && c1 == (CLASS_UNICODE | 0x3094)) {
4855 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4857 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4861 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv - output filter that screens characters before
 * forwarding them to o_iso2022jp_check_conv.
 *
 * Three tests are applied: c2 in 0x00..0x20 together with c1 in
 * 0x7f..0xff; c2 in rows 0x29..0x2f or 0x75..0x7e; and membership of the
 * code c in any [start, end] pair of the range[] table.
 * NOTE(review): presumably a character matching any test is replaced by a
 * substitute before being forwarded -- confirm.
 */
4865 void iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4867 static const nkf_char range[RANGE_NUM_MAX][2] = {
4888 nkf_char start, end, c;
4890 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4894 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4899 for (i = 0; i < RANGE_NUM_MAX; i++) {
4900 start = range[i][0];
4903 if (c >= start && c <= end) {
4908 (*o_iso2022jp_check_conv)(c2,c1);
4912 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * MIME encoded-word prefixes recognised by mime_begin_strict().  Each
 * string starts with '=' (written as \075) and ends with the encoding
 * letter followed by '?'.  mime_priority_func[], mime_encode[] and
 * mime_encode_method[] are parallel tables and must list their entries in
 * the same order.
 */
4914 static const unsigned char *mime_pattern[] = {
4915 (const unsigned char *)"\075?EUC-JP?B?",
4916 (const unsigned char *)"\075?SHIFT_JIS?B?",
4917 (const unsigned char *)"\075?ISO-8859-1?Q?",
4918 (const unsigned char *)"\075?ISO-8859-1?B?",
4919 (const unsigned char *)"\075?ISO-2022-JP?B?",
4920 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4921 #if defined(UTF8_INPUT_ENABLE)
4922 (const unsigned char *)"\075?UTF-8?B?",
4923 (const unsigned char *)"\075?UTF-8?Q?",
4925 (const unsigned char *)"\075?US-ASCII?Q?",
/*
 * Marker used to raise the priority of the matching input code: entry j
 * is the input converter installed by mime_begin_strict() when
 * mime_pattern[j] matched.
 * NOTE(review): confirm how callers treat a 0 entry.
 */
4931 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4932 e_iconv, s_iconv, 0, 0, 0, 0,
4933 #if defined(UTF8_INPUT_ENABLE)
/* Character-set code for each mime_pattern[] entry, in the same order. */
4939 static const nkf_char mime_encode[] = {
4940 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201,
4941 #if defined(UTF8_INPUT_ENABLE)
/*
 * Transfer-encoding letter ('B' or 'Q') for each mime_pattern[] entry, in
 * the same order.
 */
4948 static const nkf_char mime_encode_method[] = {
4949 'B', 'B','Q', 'B', 'B', 'Q',
4950 #if defined(UTF8_INPUT_ENABLE)
/*
 * Size of the recovery buffer in mime_begin_strict() and the maximum
 * number of characters mime_begin() scans while looking for a preamble.
 */
4958 #define MAXRECOVER 20
/*
 * switch_mime_getc - install the MIME-decoding reader into the input
 * chain.
 *
 * Unless already installed, the current i_getc / i_ungetc are saved in
 * i_mgetc / i_mungetc and replaced by mime_getc / mime_ungetc.  In
 * STRICT_MIME mode a second layer is added the same way: the saved
 * readers move to i_mgetc_buf / i_mungetc_buf and are replaced by
 * mime_getc_buf / mime_ungetc_buf.
 */
4960 void switch_mime_getc(void)
4962 if (i_getc!=mime_getc) {
4963 i_mgetc = i_getc; i_getc = mime_getc;
4964 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4965 if(mime_f==STRICT_MIME) {
4966 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4967 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * unswitch_mime_getc - undo switch_mime_getc().
 *
 * In STRICT_MIME mode the buffered layer is removed first; the saved
 * reader is then put back into i_ungetc.  If an input converter was saved
 * in mime_iconv_back it is reinstalled with set_iconv(), and
 * mime_iconv_back is cleared.
 */
4972 void unswitch_mime_getc(void)
4974 if(mime_f==STRICT_MIME) {
4975 i_mgetc = i_mgetc_buf;
4976 i_mungetc = i_mungetc_buf;
4979 i_ungetc = i_mungetc;
4980 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4981 mime_iconv_back = NULL;
/*
 * mime_begin_strict - called after "=?" has been read; matches the rest of
 * the input against mime_pattern[] (case-insensitively, via nkf_toupper).
 *
 * Characters read are kept in r[] so they can be replayed on failure.
 * When a character does not fit the current pattern, the later patterns
 * are searched for one that shares the prefix matched so far and accepts
 * the new character; if none does, the input is recovered from r[].
 * On success mime_decode_mode is set from the pattern's encoding letter
 * (p[i-2]), the current iconv is saved in mime_iconv_back and replaced by
 * mime_priority_func[j], and for 'B' encoding mimebuf_f is taken from
 * unbuf_f.  The return value on that path is that of mime_integrity().
 */
4984 nkf_char mime_begin_strict(FILE *f)
4988 const unsigned char *p,*q;
4989 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4991 mime_decode_mode = FALSE;
4992 /* =? has been checked */
4994 p = mime_pattern[j];
4997 for(i=2;p[i]>SP;i++) { /* start at =? */
4998 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4999 /* pattern fails, try next one */
5001 while (mime_pattern[++j]) {
5002 p = mime_pattern[j];
5003 for(k=2;k<i;k++) /* assume length(p) > i */
5004 if (p[k]!=q[k]) break;
5005 if (k==i && nkf_toupper(c1)==p[k]) break;
5007 p = mime_pattern[j];
5008 if (p) continue; /* found next one, continue */
5009 /* all fails, output from recovery buffer */
5017 mime_decode_mode = p[i-2];
5019 mime_iconv_back = iconv;
5020 set_iconv(FALSE, mime_priority_func[j]);
5021 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
5023 if (mime_decode_mode=='B') {
5024 mimebuf_f = unbuf_f;
5026 /* do MIME integrity check */
5027 return mime_integrity(f,mime_pattern[j]);
/*
 * mime_getc_buf - read the next character for the MIME decoder: from the
 * underlying reader i_mgetc_buf when mimebuf_f is set, otherwise from the
 * Fifo at mime_input (which is then advanced).
 */
5035 nkf_char mime_getc_buf(FILE *f)
5037 /* we don't keep eof of Fifo, because it contains ?= as
5038 a terminator. It was checked in mime_integrity. */
5039 return ((mimebuf_f)?
5040 (*i_mgetc_buf)(f):Fifo(mime_input++));
/*
 * mime_ungetc_buf - counterpart of mime_getc_buf(): push c back either to
 * the underlying reader via i_mungetc_buf, or into the Fifo by stepping
 * mime_input back one slot and storing c there as an unsigned char.
 * NOTE(review): presumably selected by mimebuf_f, as in mime_getc_buf()
 * -- confirm.
 */
5043 nkf_char mime_ungetc_buf(nkf_char c, FILE *f)
5046 (*i_mungetc_buf)(c,f);
5048 Fifo(--mime_input) = (unsigned char)c;
/*
 * mime_begin - non-strict MIME preamble reader, called after "=?" has been
 * seen.
 *
 * Every character read is also appended to the Fifo (at mime_last), so the
 * text can be re-read from there if this turns out not to be a MIME
 * preamble.  At most MAXRECOVER characters are scanned.  The charset part
 * may contain letters, digits, '-', '_', spaces and newlines; the
 * character after it selects mime_decode_mode ('B' or 'Q', in either
 * case).
 * If no encoding was recognised, mime_decode_mode is set to 1, meaning
 * "do not decode, just replay the Fifo".
 * Returns the last character read, which callers use only to detect EOF.
 */
5052 nkf_char mime_begin(FILE *f)
5057 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
5058 /* re-read and convert again from mime_buffer. */
5060 /* =? has been checked */
5062 Fifo(mime_last++)='='; Fifo(mime_last++)='?';
5063 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
5064 /* We accept any character type even if it is broken by new lines */
5065 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5066 if (c1==LF||c1==SP||c1==CR||
5067 c1=='-'||c1=='_'||is_alnum(c1)) continue;
5069 /* Failed. But this could be another MIME preamble */
5077 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5078 if (!(++i<MAXRECOVER) || c1==EOF) break;
5079 if (c1=='b'||c1=='B') {
5080 mime_decode_mode = 'B';
5081 } else if (c1=='q'||c1=='Q') {
5082 mime_decode_mode = 'Q';
5086 c1 = (*i_getc)(f); Fifo(mime_last++) = (unsigned char)c1;
5087 if (!(++i<MAXRECOVER) || c1==EOF) break;
5089 mime_decode_mode = FALSE;
5095 if (!mime_decode_mode) {
5096 /* false MIME preamble, restart from mime_buffer */
5097 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
5098 /* Since we are in MIME mode until buffer becomes empty, */
5099 /* we never go into mime_begin again for a while. */
5102 /* discard mime preamble, and goto MIME mode */
5104 /* do no MIME integrity check */
5105 return c1; /* used only for checking EOF */
/*
 * Output hook taking a single character.
 * NOTE(review): the body is not part of this listing; presumably a sink that
 * discards c -- confirm against the full file.
 */
5109 void no_putc(nkf_char c)
/*
 * Write str followed by a newline to stderr; a null str is printed as the
 * text NULL.
 * NOTE(review): any guard around the fprintf (for example a debug flag) is
 * not visible in this listing.
 */
5114 void debug(const char *str)
5117 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Record the name of the detected input encoding in input_codename.
 * The first name seen is stored as-is (the pointer is kept, not copied).
 * If a later call supplies a different name, input_codename is replaced by
 * the empty string, which marks the input as having conflicting detections.
 */
5122 void set_input_codename(char *codename)
5124 if (!input_codename) {
5125 input_codename = codename;
5126 } else if (strcmp(codename, input_codename) != 0) {
5127 input_codename = "";
/*
 * Return the name to report for the guessed input encoding, refining
 * input_codename in place:
 *   - an empty name becomes BINARY;
 *   - no name at all becomes ASCII;
 *   - Shift_JIS, EUC-JP and ISO-2022-JP are narrowed to a more specific
 *     variant according to the score bits of the input_code entry that
 *     matches the current iconv function.
 * NOTE(review): p is dereferenced without a visible null check; presumably
 * find_inputcode_byfunc always finds an entry when one of these names has
 * been recorded -- confirm.
 */
5131 static char* get_guessed_code(void)
5133 if (input_codename && !*input_codename) {
5134 input_codename = "BINARY";
5136 struct input_code *p = find_inputcode_byfunc(iconv);
5137 if (!input_codename) {
5138 input_codename = "ASCII";
5139 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
5140 if (p->score & (SCORE_DEPEND|SCORE_CP932))
5141 input_codename = "CP932";
5142 } else if (strcmp(input_codename, "EUC-JP") == 0) {
/* X0212 evidence takes precedence over the generic vendor-extension bits. */
5143 if (p->score & (SCORE_X0212))
5144 input_codename = "EUCJP-MS";
5145 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5146 input_codename = "CP51932";
5147 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
/* Kana evidence takes precedence over the generic vendor-extension bits. */
5148 if (p->score & (SCORE_KANA))
5149 input_codename = "CP50221";
5150 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
5151 input_codename = "CP50220";
5154 return input_codename;
5157 #if !defined(PERL_XS) && !defined(WIN32DLL)
/*
 * Print the guessed input encoding on stdout.
 * If filename is non-null the line is prefixed with the file name and a
 * colon. The encoding name comes from get_guessed_code unless
 * input_codename is the empty string, which is handled by a branch whose
 * body is not visible in this listing.
 * The trailing conditional chain selects a newline-type suffix from
 * input_newline: CR, LF, CRLF, or MIXED NL when input_newline equals EOF.
 * NOTE(review): the statement that consumes that suffix is not visible here.
 */
5158 void print_guessed_code(char *filename)
5160 if (filename != NULL) printf("%s: ", filename);
5161 if (input_codename && !*input_codename) {
5164 input_codename = get_guessed_code();
5166 printf("%s\n", input_codename);
5170 input_newline == CR ? " (CR)" :
5171 input_newline == LF ? " (LF)" :
5172 input_newline == CRLF ? " (CRLF)" :
5173 input_newline == EOF ? " (MIXED NL)" :
/*
 * Shared reader for two-hex-digit escapes.
 *   ch  the escape introducer, as passed by the caller
 *   f   the input stream
 *   g   getc-style hook used to read characters
 *   u   ungetc-style hook used to push characters back
 * When both characters after the introducer are hex digits, returns the byte
 * (hex2bin(c2) << 4) | hex2bin(c3).
 * NOTE(review): the comparison against ch and the recovery taken when c2 or
 * c3 is not a hex digit are in lines not shown in this listing.
 */
5182 nkf_char hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
5184 nkf_char c1, c2, c3;
5190 if (!nkf_isxdigit(c2)){
5195 if (!nkf_isxdigit(c3)){
5200 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Read one character through hex_getc with a colon as the escape introducer, using the i_cgetc and i_cungetc hooks. */
5203 nkf_char cap_getc(FILE *f)
5205 return hex_getc(':', f, i_cgetc, i_cungetc);
/* Push c back by forwarding to the i_cungetc hook; returns whatever that hook returns. */
5208 nkf_char cap_ungetc(nkf_char c, FILE *f)
5210 return (*i_cungetc)(c, f);
/* Read one character through hex_getc with a percent sign as the escape introducer, using the i_ugetc and i_uungetc hooks. */
5213 nkf_char url_getc(FILE *f)
5215 return hex_getc('%', f, i_ugetc, i_uungetc);
/* Push c back by forwarding to the i_uungetc hook; returns whatever that hook returns. */
5218 nkf_char url_ungetc(nkf_char c, FILE *f)
5220 return (*i_uungetc)(c, f);
5224 #ifdef NUMCHAR_OPTION
/*
 * Reader for numeric character references.
 * Characters are read through the i_ngetc hook; i_nungetc is held for
 * push-back. Two forms are visible:
 *   - after an x or X, up to 7 hex digits are folded into c;
 *   - otherwise up to 8 decimal digits are folded into c.
 * On success the value is returned tagged with CLASS_UNICODE.
 * NOTE(review): the introducer check, the shift or multiply applied to c
 * before each digit is merged, the terminator check and the failure path are
 * in lines not shown in this listing.
 */
5225 nkf_char numchar_getc(FILE *f)
5227 nkf_char (*g)(FILE *) = i_ngetc;
5228 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
5239 if (buf[i] == 'x' || buf[i] == 'X'){
5240 for (j = 0; j < 7; j++){
5242 if (!nkf_isxdigit(buf[i])){
5249 c |= hex2bin(buf[i]);
5252 for (j = 0; j < 8; j++){
5256 if (!nkf_isdigit(buf[i])){
/* hex2bin is also used for decimal digits; the digit was validated by nkf_isdigit above. */
5263 c += hex2bin(buf[i]);
5269 return CLASS_UNICODE | c;
/* Push c back by forwarding to the i_nungetc hook; returns whatever that hook returns. */
5278 nkf_char numchar_ungetc(nkf_char c, FILE *f)
5280 return (*i_nungetc)(c, f);
5284 #ifdef UNICODE_NORMALIZATION
5286 /* Normalization Form C */
/*
 * Input filter that replaces a decomposed byte sequence held in buf with its
 * composed form.
 * The bytes in buf are compared against the nfd column of
 * normalization_table using a binary search (lower/upper/midpoint j). When
 * an entry matches, the nfc bytes of that entry are copied over the start of
 * buf.
 * Characters are read through i_nfc_getc; i_nfc_ungetc is held for push-back.
 * NOTE(review): the search requires normalization_table to be ordered by its
 * nfd bytes -- confirm against the table definition. The code that fills
 * buf, detects a full match and returns a value is in lines not shown here.
 */
5287 nkf_char nfc_getc(FILE *f)
5289 nkf_char (*g)(FILE *f) = i_nfc_getc;
5290 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
5291 int i=0, j, k=1, lower, upper;
5293 const unsigned char *array;
/* Stop when the last search did not advance (k == 0) or buf[i] is a 10xxxxxx byte. */
5296 while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
5297 lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
5298 while (upper >= lower) {
5299 j = (lower+upper) / 2;
5300 array = normalization_table[j].nfd;
5301 for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
5302 if (array[k] != buf[k]){
/* Mismatch: narrow the search range to the half that can still contain buf. */
5303 array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
/* Overwrite buf with the composed bytes of the matching entry. */
5310 array = normalization_table[j].nfc;
5311 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
5312 buf[i] = (nkf_char)(array[i]);
/* Push c back by forwarding to the i_nfc_ungetc hook; returns whatever that hook returns. */
5323 nkf_char nfc_ungetc(nkf_char c, FILE *f)
5325 return (*i_nfc_ungetc)(c, f);
5327 #endif /* UNICODE_NORMALIZATION */
/*
 * Body of the MIME-decoding character reader.
 * NOTE(review): the function header is not part of this listing; other lines
 * of the file refer to this routine as mime_getc -- confirm.
 *
 * Visible behaviour, in order:
 *   1. If the Fifo holds unread bytes (mime_top != mime_last), return the
 *      next one.
 *   2. If mime_decode_mode is 1 or FALSE, leave MIME mode
 *      (unswitch_mime_getc) and read directly from i_getc.
 *   3. In Q mode, decode quoted-printable: an underscore maps to SP outside
 *      FIXED_MIME, =XX is returned as a hex-decoded byte, and ?= ends the
 *      encoded-word.
 *   4. In B mode, read four Base64 characters, decode them to three bytes
 *      appended to the Fifo, and return the first of them.
 * After an encoded-word ends, trailing linear white space is collected into
 * a heap buffer (lwsp_buf, grown with realloc) and conditionally pushed back
 * with i_ungetc.
 */
5333 nkf_char c1, c2, c3, c4, cc;
5334 nkf_char t1, t2, t3, t4, mode, exit_mode;
5335 nkf_char lwsp_count;
5338 nkf_char lwsp_size = 128;
5340 if (mime_top != mime_last) { /* Something is in FIFO */
5341 return Fifo(mime_top++);
5343 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
5344 mime_decode_mode=FALSE;
5345 unswitch_mime_getc();
5346 return (*i_getc)(f);
5349 if (mimebuf_f == FIXED_MIME)
5350 exit_mode = mime_decode_mode;
/* Quoted-printable branch */
5353 if (mime_decode_mode == 'Q') {
5354 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5356 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
5357 if (c1<=SP || DEL<=c1) {
5358 mime_decode_mode = exit_mode; /* prepare for quit */
5361 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
5365 mime_decode_mode = exit_mode; /* prepare for quit */
5366 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
5367 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
5368 /* end Q encoding */
5369 input_mode = exit_mode;
/* The buffer is allocated 5 bytes larger than lwsp_size. */
5371 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5372 if (lwsp_buf==NULL) {
5373 perror("can't malloc");
5376 while ((c1=(*i_getc)(f))!=EOF) {
5381 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5389 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
5390 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5405 lwsp_buf[lwsp_count] = (unsigned char)c1;
5406 if (lwsp_count++>lwsp_size){
/* realloc result goes to a temporary so lwsp_buf stays valid on failure. */
5408 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5409 if (lwsp_buf_new==NULL) {
5411 perror("can't realloc");
5414 lwsp_buf = lwsp_buf_new;
5420 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
/* Push back in reverse order; this loop stops before index 0. */
5422 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5423 i_ungetc(lwsp_buf[lwsp_count],f);
/*
 * NOTE(review): in the soft-wrap loop below, the loop condition reads a
 * character into c1 and the loop body then reads another character into c1
 * before testing for EOF. The character read by the condition is therefore
 * never examined, and if EOF is negative an EOF seen by the condition read
 * satisfies c1 <= SP and is skipped as well. Confirm whether the body was
 * meant to test the already-read c1 instead of reading again.
 */
5429 if (c1=='='&&c2<SP) { /* this is soft wrap */
5430 while((c1 = (*i_mgetc)(f)) <=SP) {
5431 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
5433 mime_decode_mode = 'Q'; /* still in MIME */
5434 goto restart_mime_q;
5437 mime_decode_mode = 'Q'; /* still in MIME */
5441 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
5442 if (c2<=SP) return c2;
5443 mime_decode_mode = 'Q'; /* still in MIME */
5444 return ((hex2bin(c2)<<4) + hex2bin(c3));
5447 if (mime_decode_mode != 'B') {
5448 mime_decode_mode = FALSE;
5449 return (*i_mgetc)(f);
5453 /* Base64 encoding */
5455 MIME allows line break in the middle of
5456 Base64, but we are very pessimistic in decoding
5457 in unbuf mode because MIME encoded code may broken by
5458 less or editor's control sequence (such as ESC-[-K in unbuffered
5459 mode. ignore incomplete MIME.
5461 mode = mime_decode_mode;
5462 mime_decode_mode = exit_mode; /* prepare for quit */
5464 while ((c1 = (*i_mgetc)(f))<=SP) {
5469 if ((c2 = (*i_mgetc)(f))<=SP) {
5472 if (mime_f != STRICT_MIME) goto mime_c2_retry;
5473 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5476 if ((c1 == '?') && (c2 == '=')) {
5479 lwsp_buf = malloc((lwsp_size+5)*sizeof(char));
5480 if (lwsp_buf==NULL) {
5481 perror("can't malloc");
5484 while ((c1=(*i_getc)(f))!=EOF) {
5489 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5497 if ((c1=(*i_getc)(f))!=EOF) {
5501 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
5516 lwsp_buf[lwsp_count] = (unsigned char)c1;
5517 if (lwsp_count++>lwsp_size){
5519 lwsp_buf_new = realloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
5520 if (lwsp_buf_new==NULL) {
5522 perror("can't realloc");
5525 lwsp_buf = lwsp_buf_new;
5531 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5533 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5534 i_ungetc(lwsp_buf[lwsp_count],f);
5541 if ((c3 = (*i_mgetc)(f))<=SP) {
5544 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5545 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5549 if ((c4 = (*i_mgetc)(f))<=SP) {
5552 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5553 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5557 mime_decode_mode = mode; /* still in MIME sigh... */
5559 /* BASE 64 decoding */
5561 t1 = 0x3f & base64decode(c1);
5562 t2 = 0x3f & base64decode(c2);
5563 t3 = 0x3f & base64decode(c3);
5564 t4 = 0x3f & base64decode(c4);
5565 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5567 Fifo(mime_last++) = (unsigned char)cc;
5568 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5570 Fifo(mime_last++) = (unsigned char)cc;
5571 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5573 Fifo(mime_last++) = (unsigned char)cc;
5578 return Fifo(mime_top++);
/*
 * Push c back onto the decoded-byte Fifo by stepping mime_top back one slot
 * and storing c (truncated to unsigned char) there. The stream argument f is
 * not used by the visible statement.
 * NOTE(review): the return statement is not part of this listing.
 */
5581 nkf_char mime_ungetc(nkf_char c, FILE *f)
5583 Fifo(--mime_top) = (unsigned char)c;
/*
 * Buffered-mode integrity check for a MIME encoded-word.
 *   f  input stream, read through the i_getc hook
 *   p  the preamble pattern that was already matched; it is copied into the
 *      Fifo first so the text can be replayed if the check fails
 * Input is then appended to the Fifo until one of the following:
 *   - the write position wraps to mime_top modulo the buffer mask (full);
 *   - the ?= terminator is seen (d holding the previous character), which
 *     accepts the word;
 *   - a character outside +, /, =, ? and alphanumerics is seen.
 * On an incomplete word the buffered text is marked as undecoded
 * (mime_last = mime_input, mime_decode_mode = 1) and the buffered reader is
 * switched in so the raw text is replayed.
 * NOTE(review): the return values and the maintenance of d are in lines not
 * shown in this listing.
 */
5587 nkf_char mime_integrity(FILE *f, const unsigned char *p)
5591 /* In buffered mode, read until =? or NL or buffer full
5593 mime_input = mime_top;
5594 mime_last = mime_top;
5596 while(*p) Fifo(mime_input++) = *p++;
5599 while((c=(*i_getc)(f))!=EOF) {
5600 if (((mime_input-mime_top)&MIME_BUF_MASK)==0) {
5601 break; /* buffer full */
5603 if (c=='=' && d=='?') {
5604 /* checked. skip header, start decode */
5605 Fifo(mime_input++) = (unsigned char)c;
5606 /* mime_last_input = mime_input; */
5611 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
5613 /* Should we check length mod 4? */
5614 Fifo(mime_input++) = (unsigned char)c;
5617 /* In case of Incomplete MIME, no MIME decode */
5618 Fifo(mime_input++) = (unsigned char)c;
5619 mime_last = mime_input; /* point undecoded buffer */
5620 mime_decode_mode = 1; /* no decode on Fifo last in mime_getc */
5621 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * Map one Base64 alphabet character to its 6-bit value.
 * Per the in-line annotations: A..Z give 0-25, a..z give 26-51, 0..9 give
 * 52-61, + gives 62 and / gives 63. In addition a hyphen is accepted as 62
 * and an underscore as 63.
 * The character-literal arithmetic encodes the offsets: subtracting G equals
 * subtracting a and adding 26, and adding 4 after subtracting 0 equals
 * adding 52.
 * NOTE(review): the range tests guarding the first branches and the final
 * return are in lines not shown here; no rejection of characters outside
 * the alphabet is visible.
 */
5625 nkf_char base64decode(nkf_char c)
5630 i = c - 'A'; /* A..Z 0-25 */
5631 } else if (c == '_') {
5632 i = '?' /* 63 */ ; /* _ 63 */
5634 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
5636 } else if (c > '/') {
5637 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
5638 } else if (c == '+' || c == '-') {
5639 i = '>' /* 62 */ ; /* + and - 62 */
5641 i = '?' /* 63 */ ; /* / 63 */
/* Base64 output alphabet, indexed by 6-bit value. */
5646 static const char basis_64[] =
5647 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/* Previous input byte of the Base64 encoder; its low bits are merged into the next output symbol. */
5649 static nkf_char b64c;
/* Capacity of the pending-output buffer; the array has one extra slot. */
5650 #define MIMEOUT_BUF_LENGTH (60)
5651 char mimeout_buf[MIMEOUT_BUF_LENGTH+1];
/* Number of bytes currently held in mimeout_buf. */
5652 int mimeout_buf_count = 0;
/*
 * Begin a MIME encoded-word on output for the given output mode.
 * The preamble pattern is chosen by matching mode against mime_encode,
 * defaulting to mime_pattern[0], and mimeout_mode is set from
 * mime_encode_method.
 * If the current line is already longer than 45 columns (base64_count), a
 * line break is emitted first, preceded by a pending blank from mimeout_buf
 * when there is one. Leading white space held in mimeout_buf is written raw
 * through o_mputc. The buffer is then emptied and its contents are re-fed
 * through mime_putc so they are encoded.
 * NOTE(review): the statements that emit the pattern p, terminate the
 * matching loop and set up i and j for the later loops are in lines not
 * shown in this listing.
 */
5654 void open_mime(nkf_char mode)
5656 const unsigned char *p;
5659 p = mime_pattern[0];
5660 for(i=0;mime_pattern[i];i++) {
5661 if (mode == mime_encode[i]) {
5662 p = mime_pattern[i];
5666 mimeout_mode = mime_encode_method[i];
5668 if (base64_count>45) {
5669 if (mimeout_buf_count>0 && nkf_isblank(mimeout_buf[i])){
5670 (*o_mputc)(mimeout_buf[i]);
5673 PUT_NEWLINE((*o_mputc));
5676 if (mimeout_buf_count>0
5677 && (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5678 || mimeout_buf[i]==CR || mimeout_buf[i]==LF)) {
5682 for (;i<mimeout_buf_count;i++) {
5683 if (mimeout_buf[i]==SP || mimeout_buf[i]==TAB
5684 || mimeout_buf[i]==CR || mimeout_buf[i]==LF) {
5685 (*o_mputc)(mimeout_buf[i]);
/* Snapshot the count and reset it before re-feeding, since mime_putc may refill the buffer. */
5695 j = mimeout_buf_count;
5696 mimeout_buf_count = 0;
5698 mime_putc(mimeout_buf[i]);
/*
 * Finish the current MIME encoded-word on output.
 * Depending on mimeout_mode, the bits still held in b64c are flushed as a
 * final Base64 symbol: the low 2 bits shifted left by 4, or the low 4 bits
 * shifted left by 2.
 * NOTE(review): the case labels, the padding characters, the terminator
 * output and the new value of mimeout_mode are in lines not shown in this
 * listing.
 */
5702 void close_mime(void)
5712 switch(mimeout_mode) {
5717 (*o_mputc)(basis_64[((b64c & 0x3)<< 4)]);
5723 (*o_mputc)(basis_64[((b64c & 0xF) << 2)]);
5728 if (mimeout_mode > 0) {
5729 if (mimeout_f!=FIXED_MIME) {
5731 } else if (mimeout_mode != 'Q')
/*
 * Encode one character c into the current encoded-word, according to
 * mimeout_mode.
 * Quoted-printable: a non-alphanumeric character is written as two hex
 * digits (high nibble first) via bin2hex.
 * Base64: c is combined with the bits remembered in b64c. The visible
 * stages emit the top 6 bits of c, then 2 carried bits plus the top 4 bits
 * of c, then 4 carried bits plus the top 2 bits of c followed by the low 6
 * bits of c.
 * NOTE(review): the case labels, the escape prefix written before the hex
 * digits, the updates of b64c, mimeout_mode and base64_count, and the
 * handling of the other Q-mode cases are in lines not shown here.
 */
5736 void mimeout_addchar(nkf_char c)
5738 switch(mimeout_mode) {
5743 } else if(!nkf_isalnum(c)) {
5745 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5746 (*o_mputc)(bin2hex((c&0xf)));
5755 (*o_mputc)(basis_64[c>>2]);
5760 (*o_mputc)(basis_64[((b64c & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5766 (*o_mputc)(basis_64[((b64c & 0xF) << 2) | ((c & 0xC0) >>6)]);
5767 (*o_mputc)(basis_64[c & 0x3F]);
5778 /*nkf_char mime_lastchar2, mime_lastchar1;*/
/*
 * Called with the next character pair (c2, c1) before it is output, to keep
 * encoded header lines within a length budget.
 * The projected length is base64_count plus mimeout_buf_count/3*4, that is,
 * the pending buffer counted at its Base64-expanded size. When the budget is
 * exceeded the current word is flushed by sending EOF through o_base64conv,
 * a line break is emitted, and a continuation space is sent.
 * Thresholds visible here: 73 and 66 while an encoded-word is open, 60
 * otherwise. In the last case the encoding is chosen as Q for ASCII or
 * ISO-8859-1 output and B for anything else, and a new word is opened
 * before folding.
 * NOTE(review): the conditions that choose between the 73 and 66 thresholds
 * are in lines not shown in this listing.
 */
5780 void mime_prechar(nkf_char c2, nkf_char c1)
5782 if (mimeout_mode > 0){
5784 if (base64_count + mimeout_buf_count/3*4> 73){
5785 (*o_base64conv)(EOF,0);
5786 OCONV_NEWLINE((*o_base64conv));
5787 (*o_base64conv)(0,SP);
5791 if (base64_count + mimeout_buf_count/3*4> 66) {
5792 (*o_base64conv)(EOF,0);
5793 OCONV_NEWLINE((*o_base64conv));
5794 (*o_base64conv)(0,SP);
5800 if (c2 != EOF && base64_count + mimeout_buf_count/3*4> 60) {
5801 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5802 open_mime(output_mode);
5803 (*o_base64conv)(EOF,0);
5804 OCONV_NEWLINE((*o_base64conv));
5805 (*o_base64conv)(0,SP);
/*
 * Output one character c through the MIME encoder; c == EOF flushes.
 *
 * Two regimes are visible:
 *   - mimeout_f == FIXED_MIME: the whole stream is encoded; a line break is
 *     inserted once base64_count exceeds 71.
 *   - otherwise: characters are held in mimeout_buf while the routine decides
 *     whether they can be written raw (o_mputc) or must go into an
 *     encoded-word (open_mime / mimeout_addchar). The decision depends on
 *     mimeout_mode, on whether c is within the 7-bit range while the output
 *     mode is ASCII or ISO-8859-1, and on the last buffered character.
 * The buffer is drained whenever mimeout_buf_count exceeds
 * MIMEOUT_BUF_LENGTH.
 * NOTE(review): many statements of this function, including loop headers
 * and returns, are absent from this listing; the comments below describe
 * only what the visible lines establish.
 */
5812 void mime_putc(nkf_char c)
5817 if (mimeout_f == FIXED_MIME){
5818 if (mimeout_mode == 'Q'){
5819 if (base64_count > 71){
5820 if (c!=CR && c!=LF) {
5822 PUT_NEWLINE((*o_mputc));
5827 if (base64_count > 71){
5829 PUT_NEWLINE((*o_mputc));
5832 if (c == EOF) { /* c==EOF */
5836 if (c != EOF) { /* c!=EOF */
5842 /* mimeout_f != FIXED_MIME */
5844 if (c == EOF) { /* c==EOF */
5845 if (mimeout_mode == -1 && mimeout_buf_count > 1) open_mime(output_mode);
5846 j = mimeout_buf_count;
5847 mimeout_buf_count = 0;
5849 if (mimeout_mode > 0) {
/* NOTE(review): if mimeout_buf_count was 0, j-1 is -1 here; no guard is visible in this listing -- confirm. */
5850 if (!nkf_isblank(mimeout_buf[j-1])) {
5852 if (nkf_isspace(mimeout_buf[i]) && base64_count < 71){
5855 mimeout_addchar(mimeout_buf[i]);
5859 mimeout_addchar(mimeout_buf[i]);
5863 mimeout_addchar(mimeout_buf[i]);
5869 mimeout_addchar(mimeout_buf[i]);
/* Remember the most recently buffered character for the decisions below. */
5875 if (mimeout_buf_count > 0){
5876 lastchar = mimeout_buf[mimeout_buf_count - 1];
5881 if (mimeout_mode=='Q') {
5882 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5883 if (c == CR || c == LF) {
5888 } else if (c <= SP) {
5890 if (base64_count > 70) {
5891 PUT_NEWLINE((*o_mputc));
5894 if (!nkf_isblank(c)) {
5899 if (base64_count > 70) {
5901 PUT_NEWLINE((*o_mputc));
5904 open_mime(output_mode);
5906 if (!nkf_noescape_mime(c)) {
/* No encoded-word is open. */
5917 if (mimeout_mode <= 0) {
5918 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5919 if (nkf_isspace(c)) {
5921 if (mimeout_mode == -1) {
5924 if (c==CR || c==LF) {
5926 open_mime(output_mode);
5932 for (i=0;i<mimeout_buf_count;i++) {
5933 (*o_mputc)(mimeout_buf[i]);
5934 if (mimeout_buf[i] == CR || mimeout_buf[i] == LF){
5945 mimeout_buf[0] = (char)c;
5946 mimeout_buf_count = 1;
5948 if (base64_count > 1
5949 && base64_count + mimeout_buf_count > 76
5950 && mimeout_buf[0] != CR && mimeout_buf[0] != LF){
5951 PUT_NEWLINE((*o_mputc));
5953 if (!nkf_isspace(mimeout_buf[0])){
5958 mimeout_buf[mimeout_buf_count++] = (char)c;
5959 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
5960 open_mime(output_mode);
5965 if (lastchar==CR || lastchar == LF){
5966 for (i=0;i<mimeout_buf_count;i++) {
5967 (*o_mputc)(mimeout_buf[i]);
5970 mimeout_buf_count = 0;
/* Write all but the last buffered character raw, then restart the buffer with a single space. */
5973 for (i=0;i<mimeout_buf_count-1;i++) {
5974 (*o_mputc)(mimeout_buf[i]);
5977 mimeout_buf[0] = SP;
5978 mimeout_buf_count = 1;
5980 open_mime(output_mode);
5983 /* mimeout_mode == 'B', 1, 2 */
5984 if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5985 if (lastchar == CR || lastchar == LF){
5986 if (nkf_isblank(c)) {
5987 for (i=0;i<mimeout_buf_count;i++) {
5988 mimeout_addchar(mimeout_buf[i]);
5990 mimeout_buf_count = 0;
5991 } else if (SP<c && c<DEL) {
5993 for (i=0;i<mimeout_buf_count;i++) {
5994 (*o_mputc)(mimeout_buf[i]);
5997 mimeout_buf_count = 0;
5999 mimeout_buf[mimeout_buf_count++] = (char)c;
6002 if (c==SP || c==TAB || c==CR || c==LF) {
6003 for (i=0;i<mimeout_buf_count;i++) {
6004 if (SP<mimeout_buf[i] && mimeout_buf[i]<DEL) {
6006 for (i=0;i<mimeout_buf_count;i++) {
6007 (*o_mputc)(mimeout_buf[i]);
6010 mimeout_buf_count = 0;
6013 mimeout_buf[mimeout_buf_count++] = (char)c;
6014 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6016 for (i=0;i<mimeout_buf_count;i++) {
6017 (*o_mputc)(mimeout_buf[i]);
6020 mimeout_buf_count = 0;
6024 if (mimeout_buf_count>0 && SP<c && c!='=') {
6025 mimeout_buf[mimeout_buf_count++] = (char)c;
6026 if (mimeout_buf_count>MIMEOUT_BUF_LENGTH) {
6027 j = mimeout_buf_count;
6028 mimeout_buf_count = 0;
6030 mimeout_addchar(mimeout_buf[i]);
6037 if (mimeout_buf_count>0) {
6038 j = mimeout_buf_count;
6039 mimeout_buf_count = 0;
6041 if (mimeout_buf[i]==CR || mimeout_buf[i]==LF)
6043 mimeout_addchar(mimeout_buf[i]);
6049 (*o_mputc)(mimeout_buf[i]);
6051 open_mime(output_mode);
/*
 * Fragment that restores option flags, converter hooks and decoder state to
 * their defaults.
 * NOTE(review): the enclosing function header is not part of this listing
 * (presumably the re-initialisation routine -- confirm), and the matching
 * preprocessor end markers for the conditionals below are not shown either.
 */
6061 struct input_code *p = input_code_list;
6073 mime_f = MIME_DECODE_DEFAULT;
6074 mime_decode_f = FALSE;
6079 x0201_f = X0201_DEFAULT;
6080 iso2022jp_f = FALSE;
6081 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
6082 ms_ucs_map_f = UCS_MAP_ASCII;
6084 #ifdef UTF8_INPUT_ENABLE
6085 no_cp932ext_f = FALSE;
6086 no_best_fit_chars_f = FALSE;
6087 encode_fallback = NULL;
6088 unicode_subchar = '?';
6089 input_endian = ENDIAN_BIG;
6091 #ifdef UTF8_OUTPUT_ENABLE
6092 output_bom_f = FALSE;
6093 output_endian = ENDIAN_BIG;
6095 #ifdef UNICODE_NORMALIZATION
6111 #ifdef SHIFTJIS_CP932
/* Clear all 256 entries of the prefix table. */
6121 for (i = 0; i < 256; i++){
6122 prefix_table[i] = 0;
6126 mimeout_buf_count = 0;
6131 fold_preserve_f = FALSE;
6134 kanji_intro = DEFAULT_J;
6135 ascii_intro = DEFAULT_R;
6136 fold_margin = FOLD_MARGIN;
/* Every output converter hook starts as no_connection, which reports an internal wiring failure if it is ever called. */
6137 o_zconv = no_connection;
6138 o_fconv = no_connection;
6139 o_nlconv = no_connection;
6140 o_rot_conv = no_connection;
6141 o_hira_conv = no_connection;
6142 o_base64conv = no_connection;
6143 o_iso2022jp_check_conv = no_connection;
/* Input hooks default to the plain stream readers. */
6146 i_ungetc = std_ungetc;
6148 i_bungetc = std_ungetc;
6151 i_mungetc = std_ungetc;
6152 i_mgetc_buf = std_getc;
6153 i_mungetc_buf = std_ungetc;
6154 output_mode = ASCII;
6157 mime_decode_mode = FALSE;
6165 z_prev2=0,z_prev1=0;
6167 iconv_for_check = 0;
6169 input_codename = NULL;
6170 input_encoding = NULL;
6171 output_encoding = NULL;
/*
 * Placeholder for a two-argument output converter hook.
 * Forwards to no_connection2 with a third argument of 0, which reports an
 * internal module connection failure.
 */
6177 void no_connection(nkf_char c2, nkf_char c1)
6179 no_connection2(c2,c1,0);
/*
 * Placeholder for a three-argument converter hook: prints an internal
 * connection-failure message to stderr. The arguments are not used by the
 * visible statements.
 * NOTE(review): a statement between the message and the return is absent
 * from this listing (presumably process termination -- confirm); the return
 * value exists only to satisfy lint.
 */
6182 nkf_char no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
6184 fprintf(stderr,"nkf internal module connection failure.\n");
6186 return 0; /* LINT */
6191 #define fprintf dllprintf
6196 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
6201 fprintf(HELP_OUTPUT,
6202 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
6204 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
6205 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
6206 #ifdef UTF8_OUTPUT_ENABLE
6207 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
6209 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
6210 #ifdef UTF8_INPUT_ENABLE
6211 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
6214 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
6215 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
6216 "r {de/en}crypt ROT13/47\n"
6217 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
6218 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
6219 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
6220 "l ISO8859-1 (Latin-1) support\n"
6221 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
6222 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
6223 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
6224 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
6225 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
6226 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
6228 "T Text mode output\n"
6230 "O Output to File (DEFAULT 'nkf.out')\n"
6231 "I Convert non ISO-2022-JP charactor to GETA\n"
6232 "d,c Convert line breaks -d: LF -c: CRLF\n"
6233 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
6234 "v, V Show this usage. V: show configuration\n"
6236 "Long name options\n"
6237 " --ic=<input codeset> --oc=<output codeset>\n"
6238 " Specify the input or output codeset\n"
6239 " --fj --unix --mac --windows\n"
6240 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
6241 " Convert for the system or code\n"
6242 " --hiragana --katakana --katakana-hiragana\n"
6243 " To Hiragana/Katakana Conversion\n"
6244 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
6246 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
6248 #ifdef NUMCHAR_OPTION
6249 " --numchar-input Convert Unicode Character Reference\n"
6251 #ifdef UTF8_INPUT_ENABLE
6252 " --fb-{skip, html, xml, perl, java, subchar}\n"
6253 " Specify how nkf handles unassigned characters\n"
6256 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
6257 " Overwrite original listed files by filtered result\n"
6258 " --overwrite preserves timestamp of original files\n"
6260 " -g --guess Guess the input code\n"
6261 " --help --version Show this help/the version\n"
6262 " For more information, see also man nkf\n"
6267 void show_configuration(void)
6269 fprintf(HELP_OUTPUT,
6270 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
6273 " Compile-time options:\n"
6274 " Compiled at: " __DATE__ " " __TIME__ "\n"
6276 fprintf(HELP_OUTPUT,
6277 " Default output encoding: "
6278 #ifdef DEFAULT_ENCIDX
6279 "%s\n", nkf_enc_name(nkf_default_encoding())
6281 "%s (%s)\n", nkf_locale_encoding() ? "LOCALE" : "DEFAULT",
6282 nkf_enc_name(nkf_default_encoding())
6285 fprintf(HELP_OUTPUT,
6286 " Default output newline: "
6287 #if DEFAULT_NEWLINE == CR
6289 #elif DEFAULT_NEWLINE == CRLF
6295 " Decode MIME encoded string: "
6296 #if MIME_DECODE_DEFAULT
6302 " Convert JIS X 0201 Katakana: "
6309 " --help, --version output: "
6310 #if HELP_OUTPUT_HELP_OUTPUT