1 /** Network Kanji Filter. (PDS Version)
2 ** -*- coding: ISO-2022-JP -*-
3 ************************************************************************
4 ** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA)
5 **
\e$BO"Mm@h!'
\e(B
\e$B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j
\e(B
6 **
\e$B!J
\e(BE-Mail Address: ichikawa@flab.fujitsu.co.jp
\e$B!K
\e(B
7 ** Copyright (C) 1996,1998
9 **
\e$BO"Mm@h!'
\e(B
\e$BN05eBg3X>pJs9)3X2J
\e(B
\e$B2OLn
\e(B
\e$B??<#
\e(B mime/X0208 support
10 **
\e$B!J
\e(BE-Mail Address: kono@ie.u-ryukyu.ac.jp
\e$B!K
\e(B
11 **
\e$BO"Mm@h!'
\e(B COW for DOS & Win16 & Win32 & OS/2
12 **
\e$B!J
\e(BE-Mail Address: GHG00637@niftyserve.or.p
\e$B!K
\e(B
14 **
\e$B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"
\e(B
15 **
\e$B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#
\e(B
16 **
\e$B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#
\e(B
17 **
\e$B1DMxMxMQ$b>e5-$KH?$7$J$$HO0O$G5v2D$7$^$9!#
\e(B
18 **
\e$B%P%$%J%j$NG[I[$N:]$K$O
\e(Bversion message
\e$B$rJ]B8$9$k$3$H$r>r7o$H$7$^$9!#
\e(B
19 **
\e$B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#
\e(B
21 ** Everyone is permitted to do anything on this program
22 ** including copying, modifying, improving,
23 ** as long as you don't try to pretend that you wrote it.
24 ** i.e., the above copyright notice has to appear in all copies.
25 ** Binary distribution requires original version messages.
26 ** You don't have to ask before copying, redistribution or publishing.
27 ** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE.
28 ***********************************************************************/
30 /***********************************************************************
31 *
\e$B8=:_!"
\e(Bnkf
\e$B$O
\e(B SourceForge
\e$B$K$F%a%s%F%J%s%9$,B3$1$i$l$F$$$^$9!#
\e(B
32 * http://sourceforge.jp/projects/nkf/
33 ***********************************************************************/
34 #define NKF_VERSION "2.0.8"
35 #define NKF_RELEASE_DATE "2009-01-05"
37 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),2000 S. Kono, COW\n" \
38 "Copyright (C) 2002-2009 Kono, Furukawa, Naruse, mastodon"
49 # define INCL_DOSERRORS
55 /* state of output_mode and input_mode
134 NKF_ENCODING_TABLE_SIZE,
135 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
136 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
137 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
138 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
139 JIS_X_0208 = 0x1168, /* @B */
140 JIS_X_0212 = 0x1159, /* D */
141 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
142 JIS_X_0213_2 = 0x1229, /* P */
143 JIS_X_0213_1 = 0x1233 /* Q */
146 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
147 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
148 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
149 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
150 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
151 static void j_oconv(nkf_char c2, nkf_char c1);
152 static void s_oconv(nkf_char c2, nkf_char c1);
153 static void e_oconv(nkf_char c2, nkf_char c1);
154 static void w_oconv(nkf_char c2, nkf_char c1);
155 static void w_oconv16(nkf_char c2, nkf_char c1);
156 static void w_oconv32(nkf_char c2, nkf_char c1);
160 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
161 void (*oconv)(nkf_char c2, nkf_char c1);
162 } nkf_native_encoding;
/*
 * Native (base) encodings.  Each initializer supplies, in order, a name
 * string, the input converter (iconv) and the output converter (oconv).
 * ASCII, ISO-2022-JP and EUC-JP all use e_iconv for input; ISO-2022-JP
 * differs from the other two only in its output converter (j_oconv).
 */
164 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
165 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
166 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
167 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
168 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
169 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
170 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
175 const nkf_native_encoding *base_encoding;
178 nkf_encoding nkf_encoding_table[] = {
179 {ASCII, "US-ASCII", &NkfEncodingASCII},
180 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
181 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
182 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
183 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
184 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
185 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
186 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
187 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
188 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
189 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
190 {CP10001, "CP10001", &NkfEncodingShift_JIS},
191 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
192 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
193 {CP51932, "CP51932", &NkfEncodingEUC_JP},
194 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
195 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
196 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
197 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
198 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
199 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
200 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
201 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
202 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
203 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
204 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
205 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
206 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
207 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
208 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
209 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
210 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
211 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
212 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
213 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
214 {BINARY, "BINARY", &NkfEncodingASCII},
221 } encoding_name_to_id_table[] = {
224 {"ISO-2022-JP", ISO_2022_JP},
225 {"ISO2022JP-CP932", CP50220},
226 {"CP50220", CP50220},
227 {"CP50221", CP50221},
228 {"CSISO2022JP", CP50221},
229 {"CP50222", CP50222},
230 {"ISO-2022-JP-1", ISO_2022_JP_1},
231 {"ISO-2022-JP-3", ISO_2022_JP_3},
232 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
233 {"SHIFT_JIS", SHIFT_JIS},
235 {"WINDOWS-31J", WINDOWS_31J},
236 {"CSWINDOWS31J", WINDOWS_31J},
237 {"CP932", WINDOWS_31J},
238 {"MS932", WINDOWS_31J},
239 {"CP10001", CP10001},
242 {"EUCJP-NKF", EUCJP_NKF},
243 {"CP51932", CP51932},
244 {"EUC-JP-MS", EUCJP_MS},
245 {"EUCJP-MS", EUCJP_MS},
246 {"EUCJPMS", EUCJP_MS},
247 {"EUC-JP-ASCII", EUCJP_ASCII},
248 {"EUCJP-ASCII", EUCJP_ASCII},
249 {"SHIFT_JISX0213", SHIFT_JISX0213},
250 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
251 {"EUC-JISX0213", EUC_JISX0213},
252 {"EUC-JIS-2004", EUC_JIS_2004},
255 {"UTF-8-BOM", UTF_8_BOM},
256 {"UTF8-MAC", UTF8_MAC},
257 {"UTF-8-MAC", UTF8_MAC},
259 {"UTF-16BE", UTF_16BE},
260 {"UTF-16BE-BOM", UTF_16BE_BOM},
261 {"UTF-16LE", UTF_16LE},
262 {"UTF-16LE-BOM", UTF_16LE_BOM},
264 {"UTF-32BE", UTF_32BE},
265 {"UTF-32BE-BOM", UTF_32BE_BOM},
266 {"UTF-32LE", UTF_32LE},
267 {"UTF-32LE-BOM", UTF_32LE_BOM},
272 #if defined(DEFAULT_CODE_JIS)
273 #define DEFAULT_ENCIDX ISO_2022_JP
274 #elif defined(DEFAULT_CODE_SJIS)
275 #define DEFAULT_ENCIDX SHIFT_JIS
276 #elif defined(DEFAULT_CODE_WINDOWS_31J)
277 #define DEFAULT_ENCIDX WINDOWS_31J
278 #elif defined(DEFAULT_CODE_EUC)
279 #define DEFAULT_ENCIDX EUC_JP
280 #elif defined(DEFAULT_CODE_UTF8)
281 #define DEFAULT_ENCIDX UTF_8
/*
 * ASCII-only character classification and conversion helpers.
 * NOTE: the macro arguments are not parenthesized and are evaluated more
 * than once, so callers should pass simple expressions without side effects.
 */
/* True for ASCII letters and decimal digits. */
285 #define is_alnum(c) \
286 (('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9'))
288 /* I don't trust portability of toupper */
/* Maps 'a'..'z' to 'A'..'Z'; any other value is returned unchanged. */
289 #define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c)
290 #define nkf_isoctal(c) ('0'<=c && c<='7')
291 #define nkf_isdigit(c) ('0'<=c && c<='9')
292 #define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=c && c<='f') || ('A'<=c && c <= 'F'))
/* Blank is SP or TAB; space additionally accepts CR and LF. */
293 #define nkf_isblank(c) (c == SP || c == TAB)
294 #define nkf_isspace(c) (nkf_isblank(c) || c == CR || c == LF)
295 #define nkf_isalpha(c) (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
296 #define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
/* Printable range is SP..'~'; graphic range excludes SP ('!'..'~'). */
297 #define nkf_isprint(c) (SP<=c && c<='~')
298 #define nkf_isgraph(c) ('!'<=c && c<='~')
/* Value of a hexadecimal digit character; yields 0 for any non-hex input. */
299 #define hex2bin(c) (('0'<=c&&c<='9') ? (c-'0') : \
300 ('A'<=c&&c<='F') ? (c-'A'+10) : \
301 ('a'<=c&&c<='f') ? (c-'a'+10) : 0)
/* Upper-case hexadecimal digit for the low four bits of c. */
302 #define bin2hex(c) ("0123456789ABCDEF"[c&15])
/* True when the high byte of c2 (taken as an unsigned short) equals SS3. */
303 #define is_eucg3(c2) (((unsigned short)c2 >> 8) == SS3)
/*
 * True for CR, LF, and for characters strictly between SP and DEL other
 * than '?', '=', '_', '(', ')', '.' and the double quote (0x22).
 */
304 #define nkf_noescape_mime(c) ((c == CR) || (c == LF) || \
305 ((c > SP) && (c < DEL) && (c != '?') && (c != '=') && (c != '_') \
306 && (c != '(') && (c != ')') && (c != '.') && (c != 0x22)))
/* True when c2 lies within CP932_TABLE_BEGIN..CP932_TABLE_END (inclusive). */
308 #define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= c2 && c2 <= CP932_TABLE_END)
/* True when SP <= c < 0x60 (0xE0 with the high bit masked off). */
309 #define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c < (0xE0&0x7F))
311 #define HOLD_SIZE 1024
312 #if defined(INT_IS_SHORT)
313 #define IOBUF_SIZE 2048
315 #define IOBUF_SIZE 16384
318 #define DEFAULT_J 'B'
319 #define DEFAULT_R 'B'
326 /* MIME preprocessor */
328 #ifdef EASYWIN /*Easy Win */
329 extern POINT _BufferSize;
338 void (*status_func)(struct input_code *, nkf_char);
339 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
343 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
344 static nkf_encoding *input_encoding = NULL;
345 static nkf_encoding *output_encoding = NULL;
347 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
349 * 0: Shift_JIS, eucJP-ascii
354 #define UCS_MAP_ASCII 0
356 #define UCS_MAP_CP932 2
357 #define UCS_MAP_CP10001 3
358 static int ms_ucs_map_f = UCS_MAP_ASCII;
360 #ifdef UTF8_INPUT_ENABLE
361 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
362 static int no_cp932ext_f = FALSE;
363 /* ignore ZERO WIDTH NO-BREAK SPACE */
364 static int no_best_fit_chars_f = FALSE;
365 static int input_endian = ENDIAN_BIG;
366 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
367 static void (*encode_fallback)(nkf_char c) = NULL;
368 static void w_status(struct input_code *, nkf_char);
370 #ifdef UTF8_OUTPUT_ENABLE
371 static int output_bom_f = FALSE;
372 static int output_endian = ENDIAN_BIG;
375 static void std_putc(nkf_char c);
376 static nkf_char std_getc(FILE *f);
377 static nkf_char std_ungetc(nkf_char c,FILE *f);
379 static nkf_char broken_getc(FILE *f);
380 static nkf_char broken_ungetc(nkf_char c,FILE *f);
382 static nkf_char mime_getc(FILE *f);
384 static void mime_putc(nkf_char c);
388 #if !defined(PERL_XS) && !defined(WIN32DLL)
389 static unsigned char stdibuf[IOBUF_SIZE];
390 static unsigned char stdobuf[IOBUF_SIZE];
/* Option flags; each starts at the default shown in its initializer. */
394 static int unbuf_f = FALSE;
395 static int estab_f = FALSE;
396 static int nop_f = FALSE;
397 static int binmode_f = TRUE; /* binary mode */
398 static int rot_f = FALSE; /* ROT13/47 mode */
399 static int hira_f = FALSE; /* hiragana/katakana conversion */
400 static int alpha_f = FALSE; /* convert JIS X 0208 alphabet to ASCII */
401 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
402 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
403 static int mimebuf_f = FALSE; /* MIME buffered input */
404 static int broken_f = FALSE; /* convert ESC-less broken JIS */
405 static int iso8859_f = FALSE; /* ISO8859 through */
406 static int mimeout_f = FALSE; /* base64 mode */
407 static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
408 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
410 #ifdef UNICODE_NORMALIZATION
411 static int nfc_f = FALSE;
412 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
413 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
417 static int cap_f = FALSE;
418 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
419 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
421 static int url_f = FALSE;
422 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
423 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
/*
 * Tagging scheme for nkf_char values.  The top 8 bits (CLASS_MASK) hold a
 * class tag and the low 24 bits (VALUE_MASK) hold the value; a value whose
 * class bits equal CLASS_UNICODE is treated as a Unicode character.
 */
426 #define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
427 #define CLASS_MASK NKF_INT32_C(0xFF000000)
428 #define CLASS_UNICODE NKF_INT32_C(0x01000000)
429 #define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
430 #define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
431 #define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
/* Constructors: OR the prefix / class tag into c. */
432 #define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
433 #define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
/*
 * Predicates.  NOTE: c is not parenthesized in these expansions, so pass a
 * simple expression.  The *_bmp_p and *_value_p tests look only at the low
 * 24 bits and do not themselves check the class tag.
 */
434 #define nkf_char_unicode_p(c) ((c & CLASS_MASK) == CLASS_UNICODE)
435 #define nkf_char_unicode_bmp_p(c) ((c & VALUE_MASK) <= UNICODE_BMP_MAX)
436 #define nkf_char_unicode_value_p(c) ((c & VALUE_MASK) <= UNICODE_MAX)
438 #ifdef NUMCHAR_OPTION
439 static int numchar_f = FALSE;
440 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
441 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
445 static int noout_f = FALSE;
446 static void no_putc(nkf_char c);
447 static int debug_f = FALSE;
448 static void debug(const char *str);
449 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
452 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
453 static void set_input_codename(const char *codename);
456 static int exec_f = 0;
459 #ifdef SHIFTJIS_CP932
460 /* invert IBM extended characters to others */
461 static int cp51932_f = FALSE;
463 /* invert NEC-selected IBM extended characters to IBM extended characters */
464 static int cp932inv_f = TRUE;
466 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
467 #endif /* SHIFTJIS_CP932 */
469 static int x0212_f = FALSE;
470 static int x0213_f = FALSE;
472 static unsigned char prefix_table[256];
474 static void e_status(struct input_code *, nkf_char);
475 static void s_status(struct input_code *, nkf_char);
477 struct input_code input_code_list[] = {
478 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
479 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
480 #ifdef UTF8_INPUT_ENABLE
481 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
486 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
487 static int base64_count = 0;
489 /* X0208 -> ASCII converter */
492 static int f_line = 0; /* chars in line */
493 static int f_prev = 0;
494 static int fold_preserve_f = FALSE; /* preserve new lines */
495 static int fold_f = FALSE;
496 static int fold_len = 0;
499 static unsigned char kanji_intro = DEFAULT_J;
500 static unsigned char ascii_intro = DEFAULT_R;
504 #define FOLD_MARGIN 10
505 #define DEFAULT_FOLD 60
507 static int fold_margin = FOLD_MARGIN;
509 /* process default */
512 no_connection2(nkf_char c2, nkf_char c1, nkf_char c0)
514 fprintf(stderr,"nkf internal module connection failure.\n");
520 no_connection(nkf_char c2, nkf_char c1)
522 no_connection2(c2,c1,0);
525 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
526 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
528 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
529 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
530 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
531 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
532 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
533 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
534 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
536 /* static redirections */
538 static void (*o_putc)(nkf_char c) = std_putc;
540 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
541 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
543 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
544 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
546 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
548 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
549 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
551 /* for strict mime */
552 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
553 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
556 static int output_mode = ASCII; /* output kanji mode */
557 static int input_mode = ASCII; /* input kanji mode */
558 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
560 /* X0201 / X0208 conversion tables */
562 /* X0201 kana conversion table */
564 static const unsigned char cv[]= {
565 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
566 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
567 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
568 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
569 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
570 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
571 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
572 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
573 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
574 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
575 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
576 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
577 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
578 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
579 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
580 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
584 /* X0201 kana conversion table for dakuten */
586 static const unsigned char dv[]= {
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
592 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
593 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
594 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
595 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
596 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
598 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
605 /* X0201 kana conversion table for han-dakuten */
607 static const unsigned char ev[]= {
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
618 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
619 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
627 /* X0208 kigou conversion table */
628 /* 0x8140 - 0x819e */
629 static const unsigned char fv[] = {
631 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
632 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
633 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
634 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
635 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
636 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
637 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
638 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
639 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
647 static int option_mode = 0;
648 static int file_out_f = FALSE;
650 static int overwrite_f = FALSE;
651 static int preserve_time_f = FALSE;
652 static int backup_f = FALSE;
653 static char *backup_suffix = "";
656 static int eolmode_f = 0; /* CR, LF, CRLF */
657 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
658 static nkf_char prev_cr = 0; /* CR or 0 */
659 #ifdef EASYWIN /*Easy Win */
660 static int end_check;
663 #define STD_GC_BUFSIZE (256)
664 nkf_char std_gc_buf[STD_GC_BUFSIZE];
668 nkf_xmalloc(size_t size)
672 if (size == 0) size = 1;
676 perror("can't malloc");
684 nkf_xrealloc(void *ptr, size_t size)
686 if (size == 0) size = 1;
688 ptr = realloc(ptr, size);
690 perror("can't realloc");
697 #define nkf_xfree(ptr) free(ptr)
700 nkf_str_caseeql(const char *src, const char *target)
703 for (i = 0; src[i] && target[i]; i++) {
704 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
706 if (src[i] || target[i]) return FALSE;
711 nkf_enc_from_index(int idx)
713 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
716 return &nkf_encoding_table[idx];
720 nkf_enc_find_index(const char *name)
723 if (name[0] == 'X' && *(name+1) == '-') name += 2;
724 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
725 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
726 return encoding_name_to_id_table[i].id;
733 nkf_enc_find(const char *name)
736 idx = nkf_enc_find_index(name);
737 if (idx < 0) return 0;
738 return nkf_enc_from_index(idx);
/*
 * Accessors and predicates for an encoding descriptor pointer `enc`.
 * `enc` is dereferenced without a NULL check, so it must be a valid pointer.
 */
741 #define nkf_enc_name(enc) (enc)->name
742 #define nkf_enc_to_index(enc) (enc)->id
743 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
/* Input / output converter of the encoding's base (native) encoding. */
744 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
745 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
/* True when the base encoding is the ASCII or the ISO-2022-JP native encoding. */
746 #define nkf_enc_asciicompat(enc) (\
747 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
748 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
/* True when the base encoding is one of the UTF-8 / UTF-16 / UTF-32 native encodings. */
749 #define nkf_enc_unicode_p(enc) (\
750 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
751 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
752 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
/* True when the encoding id is CP50220, CP50221 or CP50222. */
753 #define nkf_enc_cp5022x_p(enc) (\
754 nkf_enc_to_index(enc) == CP50220 ||\
755 nkf_enc_to_index(enc) == CP50221 ||\
756 nkf_enc_to_index(enc) == CP50222)
758 #ifdef DEFAULT_CODE_LOCALE
762 #ifdef HAVE_LANGINFO_H
763 return nl_langinfo(CODESET);
764 #elif defined(__WIN32__)
766 sprintf(buf, "CP%d", GetACP());
768 #elif defined(__OS2__)
769 # if defined(INT_IS_SHORT)
775 ULONG ulCP[1], ulncp;
776 DosQueryCp(sizeof(ulCP), ulCP, &ulncp);
777 if (ulCP[0] == 932 || ulCP[0] == 943)
778 strcpy(buf, "Shift_JIS");
780 sprintf(buf, "CP%lu", ulCP[0]);
788 nkf_locale_encoding()
790 nkf_encoding *enc = 0;
791 const char *encname = nkf_locale_charmap();
793 enc = nkf_enc_find(encname);
796 #endif /* DEFAULT_CODE_LOCALE */
801 return &nkf_encoding_table[UTF_8];
805 nkf_default_encoding()
807 nkf_encoding *enc = 0;
808 #ifdef DEFAULT_CODE_LOCALE
809 enc = nkf_locale_encoding();
810 #elif defined(DEFAULT_ENCIDX)
811 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
813 if (!enc) enc = nkf_utf8_encoding();
824 nkf_buf_new(int length)
826 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
827 buf->ptr = nkf_xmalloc(length);
834 nkf_buf_dispose(nkf_buf_t *buf)
840 #define nkf_buf_length(buf) ((buf)->len)
841 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
844 nkf_buf_at(nkf_buf_t *buf, int index)
846 assert(index <= buf->len);
847 return buf->ptr[index];
851 nkf_buf_clear(nkf_buf_t *buf)
857 nkf_buf_push(nkf_buf_t *buf, unsigned char c)
859 assert(buf->capa > buf->len);
860 buf->ptr[buf->len++] = c;
864 nkf_buf_pop(nkf_buf_t *buf)
866 assert(!nkf_buf_empty_p(buf));
867 return buf->ptr[--buf->len];
870 /* Normalization Form C */
873 #define fprintf dllprintf
879 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
886 "USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"
888 "b,u Output is buffered (DEFAULT),Output is unbuffered\n"
889 "j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
890 #ifdef UTF8_OUTPUT_ENABLE
891 " After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
893 "J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
894 #ifdef UTF8_INPUT_ENABLE
895 " After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
900 "i[@B] Specify the Esc Seq for JIS X 0208-1978/83 (DEFAULT B)\n"
901 "o[BJH] Specify the Esc Seq for ASCII/Roman (DEFAULT B)\n"
902 "r {de/en}crypt ROT13/47\n"
903 "h 1 katakana->hiragana, 2 hiragana->katakana, 3 both\n"
904 "m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
905 "M[BQ] MIME encode [B:base64 Q:quoted]\n"
906 "l ISO8859-1 (Latin-1) support\n"
907 "f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
910 "Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
911 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
912 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
913 "X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
914 "B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"
918 "T Text mode output\n"
920 "O Output to File (DEFAULT 'nkf.out')\n"
921 "I Convert non ISO-2022-JP charactor to GETA\n"
922 "d,c Convert line breaks -d: LF -c: CRLF\n"
923 "-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
924 "v, V Show this usage. V: show configuration\n"
927 "Long name options\n"
928 " --ic=<input codeset> --oc=<output codeset>\n"
929 " Specify the input or output codeset\n"
930 " --fj --unix --mac --windows\n"
931 " --jis --euc --sjis --utf8 --utf16 --mime --base64\n"
932 " Convert for the system or code\n"
933 " --hiragana --katakana --katakana-hiragana\n"
934 " To Hiragana/Katakana Conversion\n"
935 " --prefix= Insert escape before troublesome characters of Shift_JIS\n"
939 " --cap-input, --url-input Convert hex after ':' or '%%'\n"
941 #ifdef NUMCHAR_OPTION
942 " --numchar-input Convert Unicode Character Reference\n"
944 #ifdef UTF8_INPUT_ENABLE
945 " --fb-{skip, html, xml, perl, java, subchar}\n"
946 " Specify how nkf handles unassigned characters\n"
951 " --in-place[=SUFFIX] --overwrite[=SUFFIX]\n"
952 " Overwrite original listed files by filtered result\n"
953 " --overwrite preserves timestamp of original files\n"
955 " -g --guess Guess the input code\n"
956 " --help --version Show this help/the version\n"
957 " For more information, see also man nkf\n"
963 show_configuration(void)
966 "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
967 " Compile-time options:\n"
968 " Compiled at: " __DATE__ " " __TIME__ "\n"
971 " Default output encoding: "
972 #ifdef DEFAULT_CODE_LOCALE
973 "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
974 #elif defined(DEFAULT_ENCIDX)
975 "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
981 " Default output end of line: "
982 #if DEFAULT_NEWLINE == CR
984 #elif DEFAULT_NEWLINE == CRLF
990 " Decode MIME encoded string: "
991 #if MIME_DECODE_DEFAULT
997 " Convert JIS X 0201 Katakana: "
1004 " --help, --version output: "
1005 #if HELP_OUTPUT_HELP_OUTPUT
1016 get_backup_filename(const char *suffix, const char *filename)
1018 char *backup_filename;
1019 int asterisk_count = 0;
1021 int filename_length = strlen(filename);
1023 for(i = 0; suffix[i]; i++){
1024 if(suffix[i] == '*') asterisk_count++;
1028 backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
1029 for(i = 0, j = 0; suffix[i];){
1030 if(suffix[i] == '*'){
1031 backup_filename[j] = '\0';
1032 strncat(backup_filename, filename, filename_length);
1034 j += filename_length;
1036 backup_filename[j++] = suffix[i++];
1039 backup_filename[j] = '\0';
1041 j = filename_length + strlen(suffix);
1042 backup_filename = nkf_xmalloc(j + 1);
1043 strcpy(backup_filename, filename);
1044 strcat(backup_filename, suffix);
1045 backup_filename[j] = '\0';
1047 return backup_filename;
1051 #ifdef UTF8_INPUT_ENABLE
1053 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1060 (*f)(0, bin2hex(c>>shift));
1071 encode_fallback_html(nkf_char c)
1076 if(c >= NKF_INT32_C(1000000))
1077 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1078 if(c >= NKF_INT32_C(100000))
1079 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1081 (*oconv)(0, 0x30+(c/10000 )%10);
1083 (*oconv)(0, 0x30+(c/1000 )%10);
1085 (*oconv)(0, 0x30+(c/100 )%10);
1087 (*oconv)(0, 0x30+(c/10 )%10);
1089 (*oconv)(0, 0x30+ c %10);
1095 encode_fallback_xml(nkf_char c)
1100 nkf_each_char_to_hex(oconv, c);
1106 encode_fallback_java(nkf_char c)
1110 if(!nkf_char_unicode_bmp_p(c)){
1114 (*oconv)(0, bin2hex(c>>20));
1115 (*oconv)(0, bin2hex(c>>16));
1119 (*oconv)(0, bin2hex(c>>12));
1120 (*oconv)(0, bin2hex(c>> 8));
1121 (*oconv)(0, bin2hex(c>> 4));
1122 (*oconv)(0, bin2hex(c ));
1127 encode_fallback_perl(nkf_char c)
1132 nkf_each_char_to_hex(oconv, c);
1138 encode_fallback_subchar(nkf_char c)
1140 c = unicode_subchar;
1141 (*oconv)((c>>8)&0xFF, c&0xFF);
1146 static const struct {
1170 {"katakana-hiragana","h3"},
1178 #ifdef UTF8_OUTPUT_ENABLE
1188 {"fb-subchar=", ""},
1190 #ifdef UTF8_INPUT_ENABLE
1191 {"utf8-input", "W"},
1192 {"utf16-input", "W16"},
1193 {"no-cp932ext", ""},
1194 {"no-best-fit-chars",""},
1196 #ifdef UNICODE_NORMALIZATION
1197 {"utf8mac-input", ""},
1209 #ifdef NUMCHAR_OPTION
1210 {"numchar-input", ""},
1216 #ifdef SHIFTJIS_CP932
1227 set_input_encoding(nkf_encoding *enc)
1229 switch (nkf_enc_to_index(enc)) {
1236 #ifdef SHIFTJIS_CP932
1239 #ifdef UTF8_OUTPUT_ENABLE
1240 ms_ucs_map_f = UCS_MAP_CP932;
1250 case ISO_2022_JP_2004:
1257 #ifdef SHIFTJIS_CP932
1260 #ifdef UTF8_OUTPUT_ENABLE
1261 ms_ucs_map_f = UCS_MAP_CP932;
1266 #ifdef SHIFTJIS_CP932
1269 #ifdef UTF8_OUTPUT_ENABLE
1270 ms_ucs_map_f = UCS_MAP_CP10001;
1278 #ifdef SHIFTJIS_CP932
1281 #ifdef UTF8_OUTPUT_ENABLE
1282 ms_ucs_map_f = UCS_MAP_CP932;
1286 #ifdef SHIFTJIS_CP932
1289 #ifdef UTF8_OUTPUT_ENABLE
1290 ms_ucs_map_f = UCS_MAP_MS;
1294 #ifdef SHIFTJIS_CP932
1297 #ifdef UTF8_OUTPUT_ENABLE
1298 ms_ucs_map_f = UCS_MAP_ASCII;
1301 case SHIFT_JISX0213:
1302 case SHIFT_JIS_2004:
1304 #ifdef SHIFTJIS_CP932
1311 #ifdef SHIFTJIS_CP932
1315 #ifdef UTF8_INPUT_ENABLE
1316 #ifdef UNICODE_NORMALIZATION
1324 input_endian = ENDIAN_BIG;
1328 input_endian = ENDIAN_LITTLE;
1333 input_endian = ENDIAN_BIG;
1337 input_endian = ENDIAN_LITTLE;
1344 set_output_encoding(nkf_encoding *enc)
1346 switch (nkf_enc_to_index(enc)) {
1349 #ifdef SHIFTJIS_CP932
1350 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1352 #ifdef UTF8_OUTPUT_ENABLE
1353 ms_ucs_map_f = UCS_MAP_CP932;
1357 #ifdef SHIFTJIS_CP932
1358 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1360 #ifdef UTF8_OUTPUT_ENABLE
1361 ms_ucs_map_f = UCS_MAP_CP932;
1366 #ifdef SHIFTJIS_CP932
1367 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1373 #ifdef SHIFTJIS_CP932
1374 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1380 #ifdef UTF8_OUTPUT_ENABLE
1381 ms_ucs_map_f = UCS_MAP_CP932;
1385 #ifdef UTF8_OUTPUT_ENABLE
1386 ms_ucs_map_f = UCS_MAP_CP10001;
1391 #ifdef SHIFTJIS_CP932
1392 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1394 #ifdef UTF8_OUTPUT_ENABLE
1395 ms_ucs_map_f = UCS_MAP_ASCII;
1400 #ifdef SHIFTJIS_CP932
1401 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1403 #ifdef UTF8_OUTPUT_ENABLE
1404 ms_ucs_map_f = UCS_MAP_ASCII;
1408 #ifdef SHIFTJIS_CP932
1409 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1411 #ifdef UTF8_OUTPUT_ENABLE
1412 ms_ucs_map_f = UCS_MAP_CP932;
1417 #ifdef UTF8_OUTPUT_ENABLE
1418 ms_ucs_map_f = UCS_MAP_MS;
1423 #ifdef UTF8_OUTPUT_ENABLE
1424 ms_ucs_map_f = UCS_MAP_ASCII;
1427 case SHIFT_JISX0213:
1428 case SHIFT_JIS_2004:
1430 #ifdef SHIFTJIS_CP932
1431 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1438 #ifdef SHIFTJIS_CP932
1439 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1442 #ifdef UTF8_OUTPUT_ENABLE
1444 output_bom_f = TRUE;
1448 output_bom_f = TRUE;
1451 output_endian = ENDIAN_LITTLE;
1452 output_bom_f = FALSE;
1455 output_endian = ENDIAN_LITTLE;
1456 output_bom_f = TRUE;
1459 output_bom_f = TRUE;
1462 output_endian = ENDIAN_LITTLE;
1463 output_bom_f = FALSE;
1466 output_endian = ENDIAN_LITTLE;
1467 output_bom_f = TRUE;
1473 static struct input_code*
1474 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1477 struct input_code *p = input_code_list;
1479 if (iconv_func == p->iconv_func){
/*
 * set_iconv: takes a flag f and an input converter iconv_func.
 * Per the comment below, f == -TRUE means "FORCE".
 * NOTE(review): the statements guarded by the two INPUT_CODE_FIX conditions
 * are not visible in this excerpt - presumably they set estab_f and iconv;
 * confirm.
 */
1489 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1491 #ifdef INPUT_CODE_FIX
1492 if (f || !input_encoding)
1499 #ifdef INPUT_CODE_FIX
1500 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
/* once estab_f is set and the converter differs from the one seen at the
   last check, look up its entry and pass the entry's name on */
1506 if (estab_f && iconv_for_check != iconv){
1507 struct input_code *p = find_inputcode_byfunc(iconv);
1509 set_input_codename(p->name);
/* remember the converter that has just been checked */
1512 iconv_for_check = iconv;
/*
 * x0212_shift: remap a value c in 0x75..0x7f by a fixed offset.
 * Two branches apply different offsets (result 0x109.. or 0x113..); the
 * condition that selects between them is not visible in this excerpt.
 * NOTE(review): x0212_unshift() accepts 0x7f..0x92, which is not the range
 * produced here - confirm whether the two are still meant to be inverses.
 */
1519 x0212_shift(nkf_char c)
1524 if (0x75 <= c && c <= 0x7f){
/* 0x75..0x7f -> 0x109..0x113 */
1525 ret = c + (0x109 - 0x75);
1528 if (0x75 <= c && c <= 0x7f){
/* 0x75..0x7f -> 0x113..0x11d */
1529 ret = c + (0x113 - 0x75);
/*
 * x0212_unshift: map 0x7f..0x88 to 0x75..0x7e, and 0x89..0x92 to
 * 0x75..0x7e combined with PREFIX_EUCG3 | 0x80.
 * NOTE(review): the default value of ret and the return statement are not
 * visible in this excerpt.
 */
1537 x0212_unshift(nkf_char c)
1540 if (0x7f <= c && c <= 0x88){
1541 ret = c + (0x75 - 0x7f);
1542 }else if (0x89 <= c && c <= 0x92){
1543 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1547 #endif /* X0212_ENABLE */
/*
 * e2s_conv: convert the pair (c2, c1) to a two-byte result stored through
 * p2 (lead) and p1 (trail); either pointer may be NULL, in which case that
 * byte is not stored.
 * Returns 1 when c2 is above 0x7F at the final range check.
 * NOTE(review): the definition of ndx, the conditions enclosing the first
 * branches and the success return value are not visible in this excerpt.
 */
1550 e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
/* rows 0x21..0x2F: lead byte computed from 0xec with a per-8-rows correction */
1556 if((0x21 <= ndx && ndx <= 0x2F)){
1557 if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
1558 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/* rows 0x6E..0x7E: lead byte computed from 0xbe */
1560 }else if(0x6E <= ndx && ndx <= 0x7E){
1561 if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
1562 if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/* other graphic rows: table lookup, row index ndx - 0x21, column (c1 & 0x7f) - 0x21 */
1568 else if(nkf_isgraph(ndx)){
1570 const unsigned short *ptr;
1571 ptr = x0212_shiftjis[ndx - 0x21];
1573 val = ptr[(c1 & 0x7f) - 0x21];
1582 c2 = x0212_shift(c2);
1584 #endif /* X0212_ENABLE */
/* anything still above 0x7F cannot be expressed: report failure */
1586 if(0x7F < c2) return 1;
/* general formula: two rows share one lead byte; odd rows use the lower
   half of the trail-byte range (skipping 0x7f), even rows the upper half */
1587 if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
1588 if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
/*
 * s2e_conv: convert the two-byte pair (c2 lead, c1 trail) to a row/cell
 * pair; p2/p1 are the output pointers.
 * Returns 1 when c1 is above 0xFC.
 * NOTE(review): the stores through p2/p1 and the success return are not
 * visible in this excerpt.
 * NOTE(review): no lower bound on c1 is visible before "c1 - 0x40" is used
 * as a table index - confirm every caller guarantees c1 >= 0x40.
 */
1593 s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
1595 #if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
/* indexed by [c2 - 0xF0][trail byte above 0x9E ? 1 : 0] */
1598 static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
1599 if (0xFC < c1) return 1;
1600 #ifdef SHIFTJIS_CP932
/* IBM-extension lead bytes are looked up in shiftjis_cp932 unless cp932inv_f is set */
1601 if (!cp932inv_f && is_ibmext_in_sjis(c2)){
1602 val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
1609 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
1610 val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
1616 #endif /* SHIFTJIS_CP932 */
/* without x0213_f, IBM-extension lead bytes are looked up in shiftjis_x0212 */
1618 if (!x0213_f && is_ibmext_in_sjis(c2)){
1619 val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
/* high byte of the table value becomes the row, tagged with PREFIX_EUCG3 */
1622 c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
/* with x0213_f, lead bytes 0xF0 and above produce PREFIX_EUCG3-tagged rows */
1635 if(x0213_f && c2 >= 0xF0){
1636 if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
1637 c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
1638 }else{ /* 78<=k<=94 */
1639 c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
1640 if (0x9E < c1) c2++;
1643 #define SJ0162 0x00e1 /* 01 - 62 ku offset */
1644 #define SJ6394 0x0161 /* 63 - 94 ku offset */
/* general case: each lead byte covers two rows; a trail byte above 0x9E
   selects the second row of the pair */
1645 c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
1646 if (0x9E < c1) c2++;
/* rebase the trail byte; values above DEL use the SP offset, others 0x1F */
1649 c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
1656 c2 = x0212_unshift(c2);
1663 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
/*
 * nkf_unicode_to_utf8: encode the code point val as UTF-8 and store the
 * resulting bytes through p1..p4.
 * NOTE(review): the one-byte branch, the stores to the unused outputs and
 * the final else are not visible in this excerpt.
 */
1665 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
/* two-byte form: 110xxxxx 10xxxxxx */
1673 }else if (val < 0x800){
1674 *p1 = 0xc0 | (val >> 6);
1675 *p2 = 0x80 | (val & 0x3f);
/* three-byte form: 1110xxxx 10xxxxxx 10xxxxxx */
1678 } else if (nkf_char_unicode_bmp_p(val)) {
1679 *p1 = 0xe0 | (val >> 12);
1680 *p2 = 0x80 | ((val >> 6) & 0x3f);
1681 *p3 = 0x80 | ( val & 0x3f);
/*
 * four-byte form.
 * NOTE(review): RFC 3629 defines the four-byte lead byte as 11110xxx, i.e.
 * 0xf0 | (val >> 18).  The expression below, 0xe0 | (val >> 16), yields a
 * three-byte style lead byte and keeps bits that p2 encodes again, so code
 * points above U+FFFF look mis-encoded - confirm and fix.
 */
1683 } else if (nkf_char_unicode_value_p(val)) {
1684 *p1 = 0xe0 | (val >> 16);
1685 *p2 = 0x80 | ((val >> 12) & 0x3f);
1686 *p3 = 0x80 | ((val >> 6) & 0x3f);
1687 *p4 = 0x80 | ( val & 0x3f);
/*
 * nkf_utf8_to_unicode: assemble a code point from up to four UTF-8 bytes,
 * c1 being the lead byte.  The branch is chosen from the lead byte value.
 * NOTE(review): the one-byte branch, the lines that merge the last trail
 * byte, and the return statements are not visible in this excerpt.
 */
1697 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/*
 * NOTE(review): 0xC2 and 0xC3 are valid two-byte lead bytes in UTF-8
 * (U+0080..U+00FF); only 0x80..0xC1 are trail bytes or invalid.  The bound
 * 0xC3 looks too high - confirm.
 */
1704 else if (c1 <= 0xC3) {
1705 /* trail byte or invalid */
/* two-byte sequence: five payload bits in the lead byte */
1708 else if (c1 <= 0xDF) {
1710 wc = (c1 & 0x1F) << 6;
/* three-byte sequence: four payload bits in the lead byte */
1713 else if (c1 <= 0xEF) {
1715 wc = (c1 & 0x0F) << 12;
1716 wc |= (c2 & 0x3F) << 6;
/*
 * four-byte sequence.
 * NOTE(review): every other branch tests the lead byte c1; this one tests
 * c2, which looks like a typo for c1 <= 0xF4 (the highest valid four-byte
 * lead byte).  As written, lead bytes above 0xF4 are accepted whenever
 * c2 <= 0xF4 - confirm.
 */
1719 else if (c2 <= 0xF4) {
1721 wc = (c1 & 0x0F) << 18;
1722 wc |= (c2 & 0x3F) << 12;
1723 wc |= (c3 & 0x3F) << 6;
1733 #ifdef UTF8_INPUT_ENABLE
/*
 * unicode_to_jis_common2: two-level table lookup.
 *   c1, c0 - index values for the first and second level
 *   pp     - first-level table (array of row pointers), psize entries
 *   p2, p1 - outputs
 * Returns 1 when pp is null, c1 is outside 0..psize-1, the selected row is
 * null, c0 is outside 0..sizeof_utf8_to_euc_C2-1, or the table entry is 0.
 * NOTE(review): the lines that derive p and val, adjust c1/c0, and produce
 * the success result are not visible in this excerpt.
 */
1735 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1736 const unsigned short *const *pp, nkf_char psize,
1737 nkf_char *p2, nkf_char *p1)
1740 const unsigned short *p;
1743 if (pp == 0) return 1;
1746 if (c1 < 0 || psize <= c1) return 1;
1748 if (p == 0) return 1;
1751 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1753 if (val == 0) return 1;
/* with no_cp932ext_f set, the condition below singles out values in the
   NEC special / IBM extended ranges */
1754 if (no_cp932ext_f && (
1755 (val>>8) == 0x2D || /* NEC special characters */
1756 val > NKF_INT32_C(0xF300) /* IBM extended characters */
/* a row value of SO is replaced by the JIS_X_0201_1976_K marker */
1764 if (c2 == SO) c2 = JIS_X_0201_1976_K;
/*
 * unicode_to_jis_common: convert a UTF-8 byte sequence (c2 lead byte, then
 * c1, c0) to a row/cell pair stored through p2/p1, using the table family
 * selected by ms_ucs_map_f.
 * When no_best_fit_chars_f is set, a number of specific byte sequences are
 * rejected with return value 1 before any table lookup.
 * NOTE(review): several conditions (the c2/c1 tests that guard the
 * individual "return 1" lines) are not visible in this excerpt.
 */
1772 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1774 const unsigned short *const *pp;
1775 const unsigned short *const *const *ppp;
/* the four tables below are indexed by (c1 & 0x3F); a non-zero entry makes
   the caller branch return 1 */
1776 static const char no_best_fit_chars_table_C2[] =
1777 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1778 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1779 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1780 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1781 static const char no_best_fit_chars_table_C2_ms[] =
1782 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1783 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1784 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1785 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1786 static const char no_best_fit_chars_table_932_C2[] =
1787 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1788 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1789 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1790 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1791 static const char no_best_fit_chars_table_932_C3[] =
1792 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1793 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1794 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1795 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
/* lead byte below 0xe0: two-byte sequence */
1801 }else if(c2 < 0xe0){
1802 if(no_best_fit_chars_f){
1803 if(ms_ucs_map_f == UCS_MAP_CP932){
1806 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1809 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1812 }else if(!cp932inv_f){
1815 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1818 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1821 }else if(ms_ucs_map_f == UCS_MAP_MS){
1822 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1823 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
/* choose the two-byte table family from ms_ucs_map_f */
1841 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1842 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1843 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1845 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
/* three-byte sequence */
1846 }else if(c0 < 0xF0){
1847 if(no_best_fit_chars_f){
1848 if(ms_ucs_map_f == UCS_MAP_CP932){
1849 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1850 }else if(ms_ucs_map_f == UCS_MAP_MS){
1855 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1858 if(c0 == 0x92) return 1;
1863 if(c1 == 0x80 || c0 == 0x9C) return 1;
1866 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1871 if(c0 == 0x94) return 1;
1874 if(c0 == 0xBB) return 1;
1884 if(c0 == 0x95) return 1;
1887 if(c0 == 0xA5) return 1;
1894 if(c0 == 0x8D) return 1;
1897 if(c0 == 0x9E && !cp932inv_f) return 1;
1900 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
/* choose the three-byte table family from ms_ucs_map_f; the first level is
   indexed by c2 - 0xE0 */
1908 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1909 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1910 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1912 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1914 #ifdef SHIFTJIS_CP932
/* on success with a PREFIX_EUCG3-tagged row and cp932inv_f clear, round-trip
   the result through e2s_conv/s2e_conv */
1915 if (!ret && !cp932inv_f && is_eucg3(*p2)) {
1917 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1918 s2e_conv(s2, s1, p2, p1);
1927 #ifdef UTF8_OUTPUT_ENABLE
/*
 * e2w_conv: map the row/cell pair (c2, c1) to a value taken from one of the
 * euc_to_utf8 / x0212_to_utf8 tables.  A row table p is chosen from the
 * class of c2 and from ms_ucs_map_f, then indexed with c1.
 * NOTE(review): the final table read and the return statements are not
 * visible in this excerpt; code_score() treats a zero result as "no such
 * character".
 */
1929 e2w_conv(nkf_char c2, nkf_char c1)
1931 const unsigned short *p;
/* JIS_X_0201_1976_K marker: single-byte table */
1933 if (c2 == JIS_X_0201_1976_K) {
1934 if (ms_ucs_map_f == UCS_MAP_CP10001) {
1942 p = euc_to_utf8_1byte;
/* PREFIX_EUCG3-tagged rows: x0212_to_utf8_2bytes */
1944 } else if (is_eucg3(c2)){
1945 if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
/* row index is (c2 & 0x7f) - 0x21, bounds-checked before use */
1948 c2 = (c2&0x7f) - 0x21;
1949 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1950 p = x0212_to_utf8_2bytes[c2];
/* remaining rows: table family chosen by ms_ucs_map_f */
1956 c2 = (c2&0x7f) - 0x21;
1957 if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
1959 ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
1960 ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
1961 euc_to_utf8_2bytes_ms[c2];
/* cell index is (c1 & 0x7f) - 0x21, bounds-checked before use */
1966 c1 = (c1 & 0x7f) - 0x21;
1967 if (0<=c1 && c1<sizeof_euc_to_utf8_1byte)
/*
 * w2e_conv: convert a UTF-8 sequence (c2 lead byte, c1, c0) to a row/cell
 * pair through p2/p1.  Lead bytes 0xc0..0xef go through
 * unicode_to_jis_common().
 * NOTE(review): the other lead-byte branches and the condition guarding the
 * NUMCHAR_OPTION store are not visible in this excerpt.
 */
1974 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1981 }else if (0xc0 <= c2 && c2 <= 0xef) {
1982 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
1983 #ifdef NUMCHAR_OPTION
/* store the decoded code point itself, wrapped by nkf_char_unicode_new() */
1986 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
1994 #ifdef UTF8_INPUT_ENABLE
/*
 * w16e_conv: convert the code point val to a row/cell pair through p2/p1.
 * BMP values are re-encoded as UTF-8 and passed to unicode_to_jis_common();
 * on the paths shown, *p1 may instead receive the code point wrapped by
 * nkf_char_unicode_new().
 * NOTE(review): the conditions guarding the two nkf_char_unicode_new()
 * stores and the return statement are not visible in this excerpt.
 */
1996 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
1998 nkf_char c1, c2, c3, c4;
2005 else if (nkf_char_unicode_bmp_p(val)){
2006 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2007 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
2010 *p1 = nkf_char_unicode_new(val);
2016 *p1 = nkf_char_unicode_new(val);
/*
 * e_iconv: input converter taking up to three bytes (c2, c1, c0).
 * NOTE(review): the calls that forward the converted character and the
 * return statements are not visible in this excerpt.
 */
2023 e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* JIS_X_0201_1976_K marker or SS2 prefix */
2025 if (c2 == JIS_X_0201_1976_K || c2 == SS2){
/* iso2022jp_f without x0201_f: substitute the GETA1/GETA2 pair */
2026 if (iso2022jp_f && !x0201_f) {
2027 c2 = GETA1; c1 = GETA2;
2029 c2 = JIS_X_0201_1976_K;
/* 0x8f prefix: the character is in c1/c0 */
2033 }else if (c2 == 0x8f){
2037 if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
2038 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
/* 10 rows x 94 cells starting at U+E3AC */
2039 c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
/* keep the 0x8f prefix in the high byte of c2 */
2042 c2 = (c2 << 8) | (c1 & 0x7f);
2044 #ifdef SHIFTJIS_CP932
/* round-trip through e2s_conv/s2e_conv */
2047 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2048 s2e_conv(s2, s1, &c2, &c1);
2055 #endif /* SHIFTJIS_CP932 */
2057 #endif /* X0212_ENABLE */
2058 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
2061 if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
2062 /* encoding is eucJP-ms, so invert to Unicode Private User Area */
/* 10 rows x 94 cells starting at U+E000 */
2063 c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
2068 #ifdef SHIFTJIS_CP932
/* with cp51932_f, rows 0x79..0x7c are round-tripped through e2s_conv/s2e_conv */
2069 if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
2071 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2072 s2e_conv(s2, s1, &c2, &c1);
2079 #endif /* SHIFTJIS_CP932 */
/*
 * s_iconv: input converter taking a lead byte c2 and a trail byte c1.
 * NOTE(review): the calls that forward the converted character and the
 * final return are not visible in this excerpt.
 */
2087 s_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
/* JIS_X_0201_1976_K marker or a byte in 0xA1..0xDF */
2089 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2090 if (iso2022jp_f && !x0201_f) {
2091 c2 = GETA1; c1 = GETA2;
2095 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
/* without x0213_f, lead bytes 0xF0..0xF9 are mapped to U+E000 onward:
   188 cells per lead byte, with trail byte 0x7F excluded */
2097 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2099 if(c1 == 0x7F) return 0;
2100 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
/* everything else goes through s2e_conv; a non-zero result is returned as is */
2103 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2104 if (ret) return ret;
/*
 * w_iconv: UTF-8 input converter; c1 is the lead byte, c2 and c3 follow.
 * The lead byte is classified through w_iconv_utf8_1st_byte and the
 * following bytes are range-checked per class.
 * Visible return values: 0 for an invalid second byte in the first class,
 * -1 or -2 when c3 is 0.
 * NOTE(review): the case labels, the first table row, the results of the
 * remaining range checks and the way c4 is obtained are not visible in this
 * excerpt (h_conv passes (c3<<8)|c4 as the third argument for four-byte
 * input - presumably it is unpacked here; confirm).
 */
2111 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2113 nkf_char ret = 0, c4 = 0;
/* class of each lead byte, indexed by c1 - 0xC0 */
2114 static const char w_iconv_utf8_1st_byte[] =
2116 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2117 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2118 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2119 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2126 if (c1 < 0 || 0xff < c1) {
2127 }else if (c1 == 0) { /* 0 : 1 byte*/
2129 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2132 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
/* second byte must be a continuation byte */
2134 if (c2 < 0x80 || 0xBF < c2) return 0;
/* second byte restricted to 0xA0..0xBF */
2137 if (c3 == 0) return -1;
2138 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
/* both following bytes must be continuation bytes */
2143 if (c3 == 0) return -1;
2144 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
/* second byte restricted to 0x80..0x9F */
2148 if (c3 == 0) return -1;
2149 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
/* second byte restricted to 0x90..0xBF */
2153 if (c3 == 0) return -2;
2154 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
/* second byte 0x80..0xBF */
2158 if (c3 == 0) return -2;
2159 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
/* second byte restricted to 0x80..0x8F */
2163 if (c3 == 0) return -2;
2164 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2172 if (c1 == 0 || c1 == EOF){
/* four-byte sequences are decoded directly to a code point */
2173 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2174 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
/* shorter sequences are converted through w2e_conv */
2177 ret = w2e_conv(c1, c2, c3, &c1, &c2);
/* result code returned for code points that are rejected */
2185 #define NKF_ICONV_INVALID_CODE_RANGE -13
/*
 * unicode_iconv: convert a single code point wc.
 * Returns NKF_ICONV_INVALID_CODE_RANGE for surrogate values and for values
 * that fall through every range test; a non-zero result of w16e_conv() is
 * returned unchanged.
 * NOTE(review): the first branch and the forwarding of the converted
 * character are not visible in this excerpt.
 */
2187 unicode_iconv(nkf_char wc)
/* (wc >> 11) == 27 selects 0xD800..0xDFFF */
2195 }else if ((wc>>11) == 27) {
2196 /* unpaired surrogate */
2197 return NKF_ICONV_INVALID_CODE_RANGE;
/*
 * NOTE(review): the strict "<" sends U+FFFF to the next branch, and the
 * strict "<" there rejects U+10FFFF as out of range.  Both are
 * noncharacters, so this may be harmless, but "<=" looks like the intent -
 * confirm.
 */
2198 }else if (wc < 0xFFFF) {
2199 ret = w16e_conv(wc, &c2, &c1);
2200 if (ret) return ret;
2201 }else if (wc < 0x10FFFF) {
2203 c1 = nkf_char_unicode_new(wc);
2205 return NKF_ICONV_INVALID_CODE_RANGE;
/* result codes asking the caller to supply more input bytes */
2211 #define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2212 #define NKF_ICONV_NEED_TWO_MORE_BYTES -2
/*
 * Combine a surrogate pair into a code point.  The constant is
 * (0xD800 << 10) + 0xDC00 - 0x10000, so the expression equals
 * 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00).
 */
2213 #define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
/*
 * nkf_iconv_utf_16: decode one UTF-16 unit, or a surrogate pair, from the
 * bytes c1..c4 according to input_endian and pass the code point to
 * unicode_iconv.
 * When the first unit is a high surrogate but the second unit is not a low
 * surrogate, NKF_ICONV_NEED_TWO_MORE_BYTES is returned.
 * NOTE(review): the non-surrogate paths are not visible in this excerpt.
 */
2215 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
/* big endian: high byte of each unit comes first (c1, c3) */
2224 if (input_endian == ENDIAN_BIG) {
2225 if (0xD8 <= c1 && c1 <= 0xDB) {
2226 if (0xDC <= c3 && c3 <= 0xDF) {
2227 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2228 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
/* otherwise: high byte of each unit comes second (c2, c4) */
2233 if (0xD8 <= c2 && c2 <= 0xDB) {
2234 if (0xDC <= c4 && c4 <= 0xDF) {
2235 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2236 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2242 return (*unicode_iconv)(wc);
2246 w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2252 w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
/*
 * nkf_iconv_utf_32: assemble a code point from the four input bytes c1..c4
 * according to input_endian and pass it to unicode_iconv.
 * Each arm builds a 24-bit value from three of the four bytes; the
 * remaining byte is not used in the lines shown.
 * NOTE(review): the case labels are not visible in this excerpt.  The byte
 * orders below correspond, in order, to 1234 (big), 4321 (little), 2143 and
 * 3412 - confirm against the labels.  An unmatched input_endian yields
 * NKF_ICONV_INVALID_CODE_RANGE.
 */
2258 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2267 switch(input_endian){
2269 wc = c2 << 16 | c3 << 8 | c4;
2272 wc = c3 << 16 | c2 << 8 | c1;
2275 wc = c1 << 16 | c4 << 8 | c3;
2278 wc = c4 << 16 | c1 << 8 | c2;
2281 return NKF_ICONV_INVALID_CODE_RANGE;
2284 return (*unicode_iconv)(wc);
/*
 * output_ascii_escape_sequence(mode): when output_mode is neither ASCII nor
 * ISO_8859_1, emit bytes through o_putc (the last one shown is ascii_intro)
 * and set output_mode to mode.
 * NOTE(review): the first bytes emitted and the end of the macro are not
 * visible in this excerpt.
 */
2288 #define output_ascii_escape_sequence(mode) do { \
2289 if (output_mode != ASCII && output_mode != ISO_8859_1) { \
2292 (*o_putc)(ascii_intro); \
2293 output_mode = mode; \
/*
 * output_escape_sequence: act on a change of output mode; the first test
 * checks whether output_mode already equals mode.
 * NOTE(review): the statement executed when the modes match, the other
 * case labels and the bytes emitted around kanji_intro are not visible in
 * this excerpt.
 */
2298 output_escape_sequence(int mode)
2300 if (output_mode == mode)
2308 case JIS_X_0201_1976_K:
2316 (*o_putc)(kanji_intro);
/*
 * j_oconv: output converter that switches character sets with
 * output_escape_sequence()/output_ascii_escape_sequence() before emitting
 * the character (c2, c1).
 * NOTE(review): most o_putc calls and several branch heads are not visible
 * in this excerpt.
 */
2341 j_oconv(nkf_char c2, nkf_char c1)
2343 #ifdef NUMCHAR_OPTION
/* a code-point value is first converted with w16e_conv */
2344 if (c2 == 0 && nkf_char_unicode_p(c1)){
2345 w16e_conv(c1, &c2, &c1);
/* still a code point: no table mapping was found */
2346 if (c2 == 0 && nkf_char_unicode_p(c1)){
2347 c2 = c1 & VALUE_MASK;
/* U+E000..U+E757 with ms_ucs_map_f set: mapped onto rows starting at 0x7F,
   94 cells per row */
2348 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2351 c2 = 0x7F + c1 / 94;
2352 c1 = 0x21 + c1 % 94;
/* otherwise hand the value to the fallback handler, if one is installed */
2354 if (encode_fallback) (*encode_fallback)(c1);
2361 output_ascii_escape_sequence(ASCII);
2364 else if (c2 == EOF) {
2365 output_ascii_escape_sequence(ASCII);
2368 else if (c2 == ISO_8859_1) {
2369 output_ascii_escape_sequence(ISO_8859_1);
2372 else if (c2 == JIS_X_0201_1976_K) {
2373 output_escape_sequence(JIS_X_0201_1976_K);
/* PREFIX_EUCG3-tagged rows: set chosen by x0213_f */
2376 } else if (is_eucg3(c2)){
2377 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2378 (*o_putc)(c2 & 0x7f);
/* out-of-range row or cell: drop the character; the upper row bound is 0x92
   in one alternative and 0x7e in the other */
2383 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2384 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2385 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
/*
 * e_oconv: output converter that emits the character (c2, c1) with the high
 * bit set on each byte and records the kind of output in output_mode.
 * NOTE(review): several branch heads and prefix-byte writes are not visible
 * in this excerpt.
 */
2392 e_oconv(nkf_char c2, nkf_char c1)
/* a code-point value is first converted with w16e_conv */
2394 if (c2 == 0 && nkf_char_unicode_p(c1)){
2395 w16e_conv(c1, &c2, &c1);
/* still a code point: no table mapping was found */
2396 if (c2 == 0 && nkf_char_unicode_p(c1)){
2397 c2 = c1 & VALUE_MASK;
/* U+E000..U+E757 with x0212_f set: mapped onto rows starting at 0x75; from
   the tenth row on, the offset 0x8FEB carries an 0x8F prefix */
2398 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2402 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2403 c1 = 0x21 + c1 % 94;
2406 (*o_putc)((c2 & 0x7f) | 0x080);
2407 (*o_putc)(c1 | 0x080);
2409 (*o_putc)((c2 & 0x7f) | 0x080);
2410 (*o_putc)(c1 | 0x080);
/* otherwise hand the value to the fallback handler, if one is installed */
2414 if (encode_fallback) (*encode_fallback)(c1);
2422 } else if (c2 == 0) {
2423 output_mode = ASCII;
/* JIS_X_0201_1976_K marker: SS2 prefix followed by the byte with bit 7 set */
2425 } else if (c2 == JIS_X_0201_1976_K) {
2426 output_mode = EUC_JP;
2427 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2428 } else if (c2 == ISO_8859_1) {
2429 output_mode = ISO_8859_1;
2430 (*o_putc)(c1 | 0x080);
/* PREFIX_EUCG3-tagged rows */
2432 } else if (is_eucg3(c2)){
2433 output_mode = EUC_JP;
2434 #ifdef SHIFTJIS_CP932
/* round-trip through e2s_conv/s2e_conv */
2437 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2438 s2e_conv(s2, s1, &c2, &c1);
2443 output_mode = ASCII;
2445 }else if (is_eucg3(c2)){
2448 (*o_putc)((c2 & 0x7f) | 0x080);
2449 (*o_putc)(c1 | 0x080);
2452 (*o_putc)((c2 & 0x7f) | 0x080);
2453 (*o_putc)(c1 | 0x080);
/* ordinary two-byte character: both bytes must be graphic, otherwise the
   converter selection is reset and the character is dropped */
2457 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2458 set_iconv(FALSE, 0);
2459 return; /* too late to rescue this char */
2461 output_mode = EUC_JP;
2462 (*o_putc)(c2 | 0x080);
2463 (*o_putc)(c1 | 0x080);
/*
 * s_oconv: output converter that turns the character (c2, c1) into a
 * two-byte code with e2s_conv() and records the kind of output in
 * output_mode.
 * NOTE(review): most o_putc calls and several branch heads are not visible
 * in this excerpt.
 */
2468 s_oconv(nkf_char c2, nkf_char c1)
2470 #ifdef NUMCHAR_OPTION
/* a code-point value is first converted with w16e_conv */
2471 if (c2 == 0 && nkf_char_unicode_p(c1)){
2472 w16e_conv(c1, &c2, &c1);
/* still a code point: no table mapping was found */
2473 if (c2 == 0 && nkf_char_unicode_p(c1)){
2474 c2 = c1 & VALUE_MASK;
/* U+E000..U+E757 without x0213_f: 188 cells per lead byte, lead bytes
   starting at 0xF0 (cp932inv_f set) or 0xEB (clear) */
2475 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2478 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
/* trail bytes start at 0x40 and skip 0x7F */
2480 c1 += 0x40 + (c1 > 0x3e);
/* otherwise hand the value to the fallback handler, if one is installed */
2485 if(encode_fallback)(*encode_fallback)(c1);
2494 } else if (c2 == 0) {
2495 output_mode = ASCII;
2497 } else if (c2 == JIS_X_0201_1976_K) {
2498 output_mode = SHIFT_JIS;
2500 } else if (c2 == ISO_8859_1) {
2501 output_mode = ISO_8859_1;
2502 (*o_putc)(c1 | 0x080);
/* PREFIX_EUCG3-tagged rows: only continue when e2s_conv succeeds */
2504 } else if (is_eucg3(c2)){
2505 output_mode = SHIFT_JIS;
2506 if (e2s_conv(c2, c1, &c2, &c1) == 0){
/* ordinary two-byte character: both bytes must be printable, otherwise the
   converter selection is reset and the character is dropped */
2512 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2513 set_iconv(FALSE, 0);
2514 return; /* too late to rescue this char */
2516 output_mode = SHIFT_JIS;
2517 e2s_conv(c2, c1, &c2, &c1);
2519 #ifdef SHIFTJIS_CP932
/* lead bytes inside the cp932inv table range are looked up there */
2521 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2522 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2528 #endif /* SHIFTJIS_CP932 */
/* emit the prefix registered for this trail byte in prefix_table, if any */
2531 if (prefix_table[(unsigned char)c1]){
2532 (*o_putc)(prefix_table[(unsigned char)c1]);
2538 #ifdef UTF8_OUTPUT_ENABLE
/*
 * w_oconv: UTF-8 output converter.  The character is turned into a code
 * point (directly for code-point values, through e2w_conv() otherwise),
 * encoded with nkf_unicode_to_utf8() and written byte by byte; bytes after
 * the first are written only when non-zero.
 * NOTE(review): the BOM output guarded by output_bom_f and the write of the
 * first byte are not visible in this excerpt.
 */
2540 w_oconv(nkf_char c2, nkf_char c1)
2546 output_bom_f = FALSE;
/* value already is a code point */
2557 if (c2 == 0 && nkf_char_unicode_p(c1)){
2558 val = c1 & VALUE_MASK;
2559 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2561 if (c2) (*o_putc)(c2);
2562 if (c3) (*o_putc)(c3);
2563 if (c4) (*o_putc)(c4);
/* row/cell pair: look up the code point first */
2570 val = e2w_conv(c2, c1);
2572 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2574 if (c2) (*o_putc)(c2);
2575 if (c3) (*o_putc)(c3);
2576 if (c4) (*o_putc)(c4);
/*
 * w_oconv16: UTF-16 output converter; byte order follows output_endian.
 * NOTE(review): the BOM bytes, the BMP output path and any masking of c1
 * before the UNICODE_MAX comparison are not visible in this excerpt.
 */
2582 w_oconv16(nkf_char c2, nkf_char c1)
2585 output_bom_f = FALSE;
2586 if (output_endian == ENDIAN_LITTLE){
/* value already is a code point */
2600 if (c2 == 0 && nkf_char_unicode_p(c1)) {
/* BMP: a single 16-bit unit, split into high and low byte */
2601 if (nkf_char_unicode_bmp_p(c1)) {
2602 c2 = (c1 >> 8) & 0xff;
/* above the BMP: surrogate pair.  0xD7C0 is 0xD800 - (0x10000 >> 10), so
   the subtraction of 0x10000 is folded into the constant */
2606 if (c1 <= UNICODE_MAX) {
2607 c2 = (c1 >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2608 c1 = (c1 & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
/* little endian: low byte of each unit first */
2609 if (output_endian == ENDIAN_LITTLE){
2610 (*o_putc)(c2 & 0xff);
2611 (*o_putc)((c2 >> 8) & 0xff);
2612 (*o_putc)(c1 & 0xff);
2613 (*o_putc)((c1 >> 8) & 0xff);
/* otherwise: high byte of each unit first */
2615 (*o_putc)((c2 >> 8) & 0xff);
2616 (*o_putc)(c2 & 0xff);
2617 (*o_putc)((c1 >> 8) & 0xff);
2618 (*o_putc)(c1 & 0xff);
/* row/cell pair: look up the code point first */
2624 nkf_char val = e2w_conv(c2, c1);
2625 c2 = (val >> 8) & 0xff;
2630 if (output_endian == ENDIAN_LITTLE){
/*
 * w_oconv32: UTF-32 output converter; byte order follows output_endian.
 * NOTE(review): the BOM bytes and the write of the fourth (most
 * significant) byte are not visible in this excerpt.
 */
2640 w_oconv32(nkf_char c2, nkf_char c1)
2643 output_bom_f = FALSE;
2644 if (output_endian == ENDIAN_LITTLE){
2662 if (c2 == ISO_8859_1) {
2664 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
/* row/cell pair: look up the code point */
2667 c1 = e2w_conv(c2, c1);
/* little endian: least significant byte first */
2670 if (output_endian == ENDIAN_LITTLE){
2671 (*o_putc)( c1 & 0xFF);
2672 (*o_putc)((c1 >> 8) & 0xFF);
2673 (*o_putc)((c1 >> 16) & 0xFF);
/* otherwise: most significant bytes first */
2677 (*o_putc)((c1 >> 16) & 0xFF);
2678 (*o_putc)((c1 >> 8) & 0xFF);
2679 (*o_putc)( c1 & 0xFF);
/*
 * Score bits OR-ed into input_code.score by set_code_score().  Each bit is
 * twice the previous one; h_conv() prefers the candidate with the lowest
 * score.
 */
2684 #define SCORE_L2 (1) /* JIS level-2 kanji */
2685 #define SCORE_KANA (SCORE_L2 << 1) /* so-called half-width kana */
2686 #define SCORE_DEPEND (SCORE_KANA << 1) /* platform-dependent characters */
2687 #define SCORE_CP932 (SCORE_DEPEND << 1) /* substitution via CP932 (IBM extended characters) */
2688 #define SCORE_X0212 (SCORE_CP932 << 1) /* JIS X 0212 */
2689 #define SCORE_NO_EXIST (SCORE_X0212 << 1) /* non-existent character */
2690 #define SCORE_iMIME (SCORE_NO_EXIST << 1) /* specified by MIME */
2691 #define SCORE_ERROR (SCORE_iMIME << 1) /* error */
/* score assigned by status_reset() */
2693 #define SCORE_INIT (SCORE_iMIME)
/*
 * score_table_A0: score bits indexed by (c2 & 0x0f) in code_score() when
 * (c2 & 0x70) == 0x20.
 * NOTE(review): the first rows and the end of the initializer are not
 * visible in this excerpt.
 */
2695 static const nkf_char score_table_A0[] = {
2698 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2699 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_NO_EXIST,
/*
 * score_table_F0: score bits indexed by (c2 & 0x0f) in code_score() when
 * (c2 & 0x70) == 0x70.
 */
2702 static const nkf_char score_table_F0[] = {
2703 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2704 SCORE_L2, SCORE_DEPEND, SCORE_NO_EXIST, SCORE_NO_EXIST,
2705 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2706 SCORE_CP932, SCORE_NO_EXIST, SCORE_NO_EXIST, SCORE_ERROR,
/*
 * set_code_score: OR the given score bits into ptr->score.
 * NOTE(review): any guard around the update is not visible in this excerpt.
 */
2710 set_code_score(struct input_code *ptr, nkf_char score)
2713 ptr->score |= score;
/*
 * clr_code_score: clear the given score bits in ptr->score.
 * NOTE(review): any guard around the update is not visible in this excerpt.
 */
2718 clr_code_score(struct input_code *ptr, nkf_char score)
2721 ptr->score &= ~score;
/*
 * code_score: classify the character held in ptr->buf[0] (and buf[1]) and
 * add the matching score bit to ptr via set_code_score().
 * NOTE(review): the condition of the first branch (SCORE_ERROR) is not
 * visible in this excerpt.
 */
2726 code_score(struct input_code *ptr)
2728 nkf_char c2 = ptr->buf[0];
2729 #ifdef UTF8_OUTPUT_ENABLE
2730 nkf_char c1 = ptr->buf[1];
2733 set_code_score(ptr, SCORE_ERROR);
/* SS2 prefix */
2734 }else if (c2 == SS2){
2735 set_code_score(ptr, SCORE_KANA);
/* 0x8f prefix */
2736 }else if (c2 == 0x8f){
2737 set_code_score(ptr, SCORE_X0212);
2738 #ifdef UTF8_OUTPUT_ENABLE
/* e2w_conv returned 0 for this pair */
2739 }else if (!e2w_conv(c2, c1)){
2740 set_code_score(ptr, SCORE_NO_EXIST);
/* rows 0x2x: per-row score from score_table_A0 */
2742 }else if ((c2 & 0x70) == 0x20){
2743 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
/* rows 0x7x: per-row score from score_table_F0 */
2744 }else if ((c2 & 0x70) == 0x70){
2745 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
/* rows 0x5x and 0x6x (0x7x was handled above) */
2746 }else if ((c2 & 0x70) >= 0x50){
2747 set_code_score(ptr, SCORE_L2);
/*
 * status_disable: called by the *_status functions when a byte does not fit
 * the candidate encoding.  If this candidate's converter is the active one,
 * the converter selection is reset with set_iconv(FALSE, 0).
 * NOTE(review): the updates to ptr itself are not visible in this excerpt.
 */
2752 status_disable(struct input_code *ptr)
2757 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
/*
 * status_push_ch: append c to ptr->buf and advance ptr->index.
 * NOTE(review): no bound check on ptr->index is visible here - confirm the
 * callers never push more bytes than buf can hold.
 */
2761 status_push_ch(struct input_code *ptr, nkf_char c)
2763 ptr->buf[ptr->index++] = c;
2767 status_clear(struct input_code *ptr)
/*
 * status_reset: set the candidate's score back to SCORE_INIT.
 * NOTE(review): any further resetting is not visible in this excerpt.
 */
2774 status_reset(struct input_code *ptr)
2777 ptr->score = SCORE_INIT;
/*
 * status_reinit: clear the candidate's _file_stat field.
 * NOTE(review): any further resetting is not visible in this excerpt.
 */
2781 status_reinit(struct input_code *ptr)
2784 ptr->_file_stat = 0;
/*
 * status_check: acts only when c is at most DEL and estab_f is set.
 * NOTE(review): the action taken is not visible in this excerpt.
 */
2788 status_check(struct input_code *ptr, nkf_char c)
2790 if (c <= DEL && estab_f){
/*
 * s_status: feed one input byte c to the detection state of the candidate
 * ptr.  Accepted bytes are pushed with status_push_ch(); a byte that does
 * not fit calls status_disable().
 * NOTE(review): the switch on the state, its case labels and the state
 * updates are not visible in this excerpt.
 */
2796 s_status(struct input_code *ptr, nkf_char c)
2800 status_check(ptr, c);
2805 }else if (nkf_char_unicode_p(c)){
/* single byte 0xa1..0xdf: recorded as SS2 + byte */
2807 }else if (0xa1 <= c && c <= 0xdf){
2808 status_push_ch(ptr, SS2);
2809 status_push_ch(ptr, c);
/* lead bytes 0x81..0x9f and 0xe0..0xea */
2812 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
2814 status_push_ch(ptr, c);
/* lead bytes 0xed..0xee */
2815 }else if (0xed <= c && c <= 0xee){
2817 status_push_ch(ptr, c);
2818 #ifdef SHIFTJIS_CP932
2819 }else if (is_ibmext_in_sjis(c)){
2821 status_push_ch(ptr, c);
2822 #endif /* SHIFTJIS_CP932 */
/* lead bytes 0xf0..0xfc */
2824 }else if (0xf0 <= c && c <= 0xfc){
2826 status_push_ch(ptr, c);
2827 #endif /* X0212_ENABLE */
2829 status_disable(ptr);
/* trail byte must be 0x40..0x7e or 0x80..0xfc; the pair is then converted
   in place with s2e_conv */
2833 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2834 status_push_ch(ptr, c);
2835 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2839 status_disable(ptr);
2843 #ifdef SHIFTJIS_CP932
/* as above, but scored SCORE_CP932 when the conversion succeeds */
2844 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
2845 status_push_ch(ptr, c);
2846 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
2847 set_code_score(ptr, SCORE_CP932);
2852 #endif /* SHIFTJIS_CP932 */
2853 status_disable(ptr);
/* as above, scored SCORE_CP932 regardless of the conversion result */
2856 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
2857 status_push_ch(ptr, c);
2858 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
2859 set_code_score(ptr, SCORE_CP932);
2862 status_disable(ptr);
/*
 * e_status: feed one input byte c to the detection state of the candidate
 * ptr.  Accepted bytes are pushed with status_push_ch(); a byte that does
 * not fit calls status_disable().
 * NOTE(review): the switch on the state, its case labels and the state
 * updates are not visible in this excerpt.
 */
2869 e_status(struct input_code *ptr, nkf_char c)
2873 status_check(ptr, c);
2878 }else if (nkf_char_unicode_p(c)){
/* SS2 prefix or a first byte in 0xa1..0xfe */
2880 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
2882 status_push_ch(ptr, c);
/* 0x8f prefix */
2884 }else if (0x8f == c){
2886 status_push_ch(ptr, c);
2887 #endif /* X0212_ENABLE */
2889 status_disable(ptr);
/* following byte must be 0xa1..0xfe */
2893 if (0xa1 <= c && c <= 0xfe){
2894 status_push_ch(ptr, c);
2898 status_disable(ptr);
/* following byte must be 0xa1..0xfe */
2903 if (0xa1 <= c && c <= 0xfe){
2905 status_push_ch(ptr, c);
2907 status_disable(ptr);
2909 #endif /* X0212_ENABLE */
2913 #ifdef UTF8_INPUT_ENABLE
/*
 * w_status: feed one input byte c to the detection state of the candidate
 * ptr.  Accepted bytes are pushed with status_push_ch(); a byte that does
 * not fit calls status_disable().
 * NOTE(review): the switch on the state, its case labels, the state updates
 * and the use of the bom flag are not visible in this excerpt.
 */
2915 w_status(struct input_code *ptr, nkf_char c)
2919 status_check(ptr, c);
2924 }else if (nkf_char_unicode_p(c)){
/* lead byte of a two-byte sequence */
2926 }else if (0xc0 <= c && c <= 0xdf){
2928 status_push_ch(ptr, c);
/* lead byte of a three-byte sequence */
2929 }else if (0xe0 <= c && c <= 0xef){
2931 status_push_ch(ptr, c);
/* lead byte of a four-byte sequence */
2932 }else if (0xf0 <= c && c <= 0xf4){
2934 status_push_ch(ptr, c);
2936 status_disable(ptr);
/* continuation byte; once more bytes than ptr->stat are buffered, the
   sequence is converted in place with w2e_conv */
2941 if (0x80 <= c && c <= 0xbf){
2942 status_push_ch(ptr, c);
2943 if (ptr->index > ptr->stat){
/* EF BB BF is the UTF-8 byte order mark */
2944 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
2945 && ptr->buf[2] == 0xbf);
2946 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
2947 &ptr->buf[0], &ptr->buf[1]);
2954 status_disable(ptr);
/* continuation byte; pushed only while fewer than ptr->stat bytes are buffered */
2958 if (0x80 <= c && c <= 0xbf){
2959 if (ptr->index < ptr->stat){
2960 status_push_ch(ptr, c);
2965 status_disable(ptr);
/*
 * code_status: run the status function of the input_code_list entries on
 * the byte c; if a result entry was selected and no encoding is established
 * yet (estab_f clear), install its converter with set_iconv(TRUE, ...).
 * NOTE(review): the loop, the rule that selects result and the use of
 * action_flag are not visible in this excerpt.
 */
2973 code_status(nkf_char c)
2975 int action_flag = 1;
2976 struct input_code *result = 0;
2977 struct input_code *p = input_code_list;
2979 if (!p->status_func) {
2983 if (!p->status_func)
2985 (p->status_func)(p, c);
2988 }else if(p->stat == 0){
2999 if (result && !estab_f){
3000 set_iconv(TRUE, result->iconv_func);
/* otherwise, for c up to DEL, input_code_list is walked again from its head */
3001 }else if (c <= DEL){
3002 struct input_code *ptr = input_code_list;
3016 return std_gc_buf[--std_gc_ndx];
/*
 * std_ungetc: push c onto std_gc_buf, the push-back stack read by the
 * "return std_gc_buf[--std_gc_ndx]" path.
 * NOTE(review): the action taken when the stack is full
 * (std_gc_ndx == STD_GC_BUFSIZE) and the return value are not visible in
 * this excerpt; f is not used in the lines shown.
 */
3023 std_ungetc(nkf_char c, FILE *f)
3025 if (std_gc_ndx == STD_GC_BUFSIZE){
3028 std_gc_buf[std_gc_ndx++] = c;
3034 std_putc(nkf_char c)
/* bytes held back by h_conv() while the input encoding is undecided */
3041 static unsigned char hold_buf[HOLD_SIZE*2];
/* number of bytes currently stored in hold_buf */
3042 static int hold_count = 0;
/*
 * push_hold_buf: append the low byte of c2 to hold_buf.
 * Returns EOF when the buffer has become full with this byte, otherwise the
 * new byte count.
 * NOTE(review): the statement executed when the buffer is already full on
 * entry is not visible in this excerpt - presumably it returns EOF without
 * storing; confirm.
 */
3044 push_hold_buf(nkf_char c2)
3046 if (hold_count >= HOLD_SIZE*2)
3048 hold_buf[hold_count++] = (unsigned char)c2;
3049 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
/*
 * h_conv: hold input bytes in hold_buf until the encoding is established,
 * EOF is reached or the buffer is full; then pick the candidate with the
 * lowest score if needed and replay the held bytes through iconv.
 * NOTE(review): the initial pushes of c1/c2, loop exits, case labels and
 * the return value are not visible in this excerpt.
 */
3053 h_conv(FILE *f, int c1, int c2)
3059 /** it must NOT be in the kanji shift sequence */
3060 /** it must NOT be written in JIS7 */
3061 /** and it must be after 2 byte 8bit code */
3067 while ((c2 = (*i_getc)(f)) != EOF) {
3073 if (push_hold_buf(c2) == EOF || estab_f) {
/* choose the candidate with the smallest score among those that have a
   status function */
3079 struct input_code *p = input_code_list;
3080 struct input_code *result = p;
3085 if (p->status_func && p->score < result->score) {
3090 set_iconv(TRUE, result->iconv_func);
3095 ** 1) EOF is detected, or
3096 ** 2) Code is established, or
3097 ** 3) Buffer is FULL (but last word is pushed)
3099 ** in 1) and 3) cases, we continue to use
3100 ** Kanji codes by oconv and leave estab_f unchanged.
/* replay the held bytes */
3105 while (hold_index < hold_count){
3106 c1 = hold_buf[hold_index++];
/* with s_iconv active, 0xa1..0xdf is a complete single-byte character */
3110 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3111 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3114 if (hold_index < hold_count){
3115 c2 = hold_buf[hold_index++];
/* the converter's result tells whether more bytes are needed; they are
   taken from hold_buf first and from the input stream after that */
3125 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3128 if (hold_index < hold_count){
3129 c3 = hold_buf[hold_index++];
3130 } else if ((c3 = (*i_getc)(f)) == EOF) {
3135 if (hold_index < hold_count){
3136 c4 = hold_buf[hold_index++];
3137 } else if ((c4 = (*i_getc)(f)) == EOF) {
/* third and fourth byte are packed into the third argument */
3142 (*iconv)(c1, c2, (c3<<8)|c4);
3147 /* 3 bytes EUC or UTF-8 */
3148 if (hold_index < hold_count){
3149 c3 = hold_buf[hold_index++];
3150 } else if ((c3 = (*i_getc)(f)) == EOF) {
3156 (*iconv)(c1, c2, c3);
3159 if (c3 == EOF) break;
3165 * Check and Ignore BOM
/*
 * Body of the BOM check (the function signature is not visible in this
 * excerpt - presumably check_bom(FILE *f); confirm).
 *
 * Byte patterns recognised, with the converter installed when no
 * input_encoding was given and the input_endian set when that converter is
 * the active one:
 *   00 00 FE FF  -> w_iconv32, ENDIAN_BIG
 *   00 00 FF FE  -> w_iconv32, ENDIAN_2143
 *   EF BB BF     -> w_iconv
 *   FE FF 00 00  -> w_iconv32, ENDIAN_3412
 *   FE FF        -> w_iconv16, ENDIAN_BIG
 *   FF FE 00 00  -> w_iconv32, ENDIAN_LITTLE
 *   FF FE        -> w_iconv16, ENDIAN_LITTLE
 * Bytes that turn out not to belong to a BOM are pushed back with i_ungetc
 * in reverse order of reading.
 * NOTE(review): the case labels, the early returns after a match and the
 * default case are not visible in this excerpt.
 */
3171 switch(c2 = (*i_getc)(f)){
3173 if((c2 = (*i_getc)(f)) == 0x00){
3174 if((c2 = (*i_getc)(f)) == 0xFE){
3175 if((c2 = (*i_getc)(f)) == 0xFF){
3176 if(!input_encoding){
3177 set_iconv(TRUE, w_iconv32);
3179 if (iconv == w_iconv32) {
3180 input_endian = ENDIAN_BIG;
3183 (*i_ungetc)(0xFF,f);
3184 }else (*i_ungetc)(c2,f);
3185 (*i_ungetc)(0xFE,f);
3186 }else if(c2 == 0xFF){
3187 if((c2 = (*i_getc)(f)) == 0xFE){
3188 if(!input_encoding){
3189 set_iconv(TRUE, w_iconv32);
3191 if (iconv == w_iconv32) {
3192 input_endian = ENDIAN_2143;
3195 (*i_ungetc)(0xFF,f);
3196 }else (*i_ungetc)(c2,f);
3197 (*i_ungetc)(0xFF,f);
3198 }else (*i_ungetc)(c2,f);
3199 (*i_ungetc)(0x00,f);
3200 }else (*i_ungetc)(c2,f);
3201 (*i_ungetc)(0x00,f);
3204 if((c2 = (*i_getc)(f)) == 0xBB){
3205 if((c2 = (*i_getc)(f)) == 0xBF){
3206 if(!input_encoding){
3207 set_iconv(TRUE, w_iconv);
3209 if (iconv == w_iconv) {
3212 (*i_ungetc)(0xBF,f);
3213 }else (*i_ungetc)(c2,f);
3214 (*i_ungetc)(0xBB,f);
3215 }else (*i_ungetc)(c2,f);
3216 (*i_ungetc)(0xEF,f);
3219 if((c2 = (*i_getc)(f)) == 0xFF){
3220 if((c2 = (*i_getc)(f)) == 0x00){
3221 if((c2 = (*i_getc)(f)) == 0x00){
3222 if(!input_encoding){
3223 set_iconv(TRUE, w_iconv32);
3225 if (iconv == w_iconv32) {
3226 input_endian = ENDIAN_3412;
3229 (*i_ungetc)(0x00,f);
3230 }else (*i_ungetc)(c2,f);
3231 (*i_ungetc)(0x00,f);
3232 }else (*i_ungetc)(c2,f);
3233 if(!input_encoding){
3234 set_iconv(TRUE, w_iconv16);
3236 if (iconv == w_iconv16) {
3237 input_endian = ENDIAN_BIG;
3240 (*i_ungetc)(0xFF,f);
3241 }else (*i_ungetc)(c2,f);
3242 (*i_ungetc)(0xFE,f);
3245 if((c2 = (*i_getc)(f)) == 0xFE){
3246 if((c2 = (*i_getc)(f)) == 0x00){
3247 if((c2 = (*i_getc)(f)) == 0x00){
3248 if(!input_encoding){
3249 set_iconv(TRUE, w_iconv32);
3251 if (iconv == w_iconv32) {
3252 input_endian = ENDIAN_LITTLE;
3255 (*i_ungetc)(0x00,f);
3256 }else (*i_ungetc)(c2,f);
3257 (*i_ungetc)(0x00,f);
3258 }else (*i_ungetc)(c2,f);
3259 if(!input_encoding){
3260 set_iconv(TRUE, w_iconv16);
3262 if (iconv == w_iconv16) {
3263 input_endian = ENDIAN_LITTLE;
3266 (*i_ungetc)(0xFE,f);
3267 }else (*i_ungetc)(c2,f);
3268 (*i_ungetc)(0xFF,f);
/* init_broken_state: zero the whole broken_state structure. */
3283 init_broken_state(void)
3285 memset(&broken_state, 0, sizeof(broken_state));
3291 broken_state.buf[broken_state.count++] = c;
/*
 * pop_broken_buf: remove and return the most recently pushed entry of
 * broken_state.buf.
 * NOTE(review): no underflow check is visible here; the call in
 * broken_getc() is guarded by broken_state.count > 0.
 */
3295 pop_broken_buf(void)
3297 return broken_state.buf[--broken_state.count];
/*
 * broken_getc: input filter that looks for "$@" / "$B" and "(J" / "(B"
 * that are not preceded by ESC.
 * Buffered characters in broken_state.buf are returned first.
 * NOTE(review): the reads of c and c1, the value returned when such a
 * sequence is found (presumably ESC - confirm) and the fallback paths are
 * not visible in this excerpt.
 */
3301 broken_getc(FILE *f)
3305 if (broken_state.count > 0) {
3306 return pop_broken_buf();
/* '$' not preceded by ESC, while input_mode is ASCII or JIS_X_0201_1976_K */
3309 if (c=='$' && broken_state.status != ESC
3310 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3312 broken_state.status = 0;
3313 if (c1=='@'|| c1=='B') {
3314 push_broken_buf(c1);
/* '(' not preceded by ESC, while input_mode is JIS_X_0208 or JIS_X_0201_1976_K */
3321 } else if (c=='(' && broken_state.status != ESC
3322 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3324 broken_state.status = 0;
3325 if (c1=='J'|| c1=='B') {
3326 push_broken_buf(c1);
/* remember the character for the "preceded by ESC" tests on the next call */
3334 broken_state.status = c;
/*
 * broken_ungetc: acts only while fewer than two entries are buffered in
 * broken_state.
 * NOTE(review): the guarded statement (presumably a push of c) and the
 * return value are not visible in this excerpt.
 */
3340 broken_ungetc(nkf_char c, FILE *f)
3342 if (broken_state.count < 2)
/*
 * eol_conv: end-of-line filter placed in front of o_eol_conv.
 *
 * While guessing (guess_f), input_eol records the first line-ending style
 * seen (CR, LF or CRLF) and is set to EOF once a different style shows up.
 * On output, a line end is rewritten according to eolmode_f: CR is emitted
 * unless eolmode_f is LF, LF is emitted unless eolmode_f is CR.  A CR is
 * held in prev_cr rather than forwarded; any character other than a bare
 * LF is forwarded unchanged.
 * NOTE(review): the declaration and resetting of prev_cr and one branch
 * head in the guessing part are not visible in this excerpt.
 */
3348 eol_conv(nkf_char c2, nkf_char c1)
3350 if (guess_f && input_eol != EOF) {
3351 if (c2 == 0 && c1 == LF) {
3352 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3353 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3354 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3356 else if (!input_eol) input_eol = CR;
3357 else if (input_eol != CR) input_eol = EOF;
/* a pending CR or a LF ends the line */
3359 if (prev_cr || (c2 == 0 && c1 == LF)) {
3361 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3362 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3364 if (c2 == 0 && c1 == CR) prev_cr = CR;
3365 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3369 Return value of fold_conv()
3371 LF add newline and output char
3372 CR add newline and output nothing
3375 1 (or else) normal output
3377 fold state in prev (previous character)
3379 >0x80 Japanese (X0208/X0201)
3384 This fold algorithm does not preserve leading spaces in a line.
3385 This is the main difference from fmt.
3388 #define char_size(c2,c1) (c2?2:1)
/*
 * fold_conv: line-folding filter placed in front of o_fconv.
 * For each character (c2, c1) it computes fold_state, updates the previous
 * character f_prev and the column counter f_line, and the terminator switch
 * at the end acts on fold_state.
 * "kinsoku" refers to characters that must not start a line; for those the
 * fold is postponed by up to fold_margin columns.
 * NOTE(review): many branch heads, else arms and the case labels of the
 * terminator switch are not visible in this excerpt.
 */
3391 fold_conv(nkf_char c2, nkf_char c1)
3394 nkf_char fold_state;
3396 if (c1== CR && !fold_preserve_f) {
3397 fold_state=0; /* ignore cr */
3398 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3400 fold_state=0; /* ignore cr */
/* backspace moves the column counter back by one */
3401 } else if (c1== BS) {
3402 if (f_line>0) f_line--;
3404 } else if (c2==EOF && f_line != 0) { /* close open last line */
/* a newline in the input */
3406 } else if ((c1==LF && !fold_preserve_f)
3407 || ((c1==CR||(c1==LF&&f_prev!=CR))
3408 && fold_preserve_f)) {
3410 if (fold_preserve_f) {
3414 } else if ((f_prev == c1 && !fold_preserve_f)
3415 || (f_prev == LF && fold_preserve_f)
3416 ) { /* duplicate newline */
3419 fold_state = LF; /* output two newlines */
3425 if (f_prev&0x80) { /* Japanese? */
3427 fold_state = 0; /* ignore given single newline */
3428 } else if (f_prev==SP) {
3432 if (++f_line<=fold_len)
3436 fold_state = CR; /* fold and output nothing */
3440 } else if (c1=='\f') {
3443 fold_state = LF; /* output newline and clear */
3444 } else if ( (c2==0 && c1==SP)||
3445 (c2==0 && c1==TAB)||
3446 (c2=='!'&& c1=='!')) {
3447 /* X0208 space (kankaku) or ASCII space */
3449 fold_state = 0; /* remove duplicate spaces */
3452 if (++f_line<=fold_len)
3453 fold_state = SP; /* output ASCII space only */
3455 f_prev = SP; f_line = 0;
3456 fold_state = CR; /* fold and output nothing */
3460 prev0 = f_prev; /* we still need this one... , but almost done */
/*
 * NOTE(review): "c2 || c2 == JIS_X_0201_1976_K" reduces to "c2" whenever
 * JIS_X_0201_1976_K is non-zero - confirm whether another test was meant.
 */
3462 if (c2 || c2 == JIS_X_0201_1976_K)
3463 f_prev |= 0x80; /* this is Japanese */
3464 f_line += char_size(c2,c1);
3465 if (f_line<=fold_len) { /* normal case */
3468 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspensions */
3469 f_line = char_size(c2,c1);
3470 fold_state = LF; /* We can't wait, do fold now */
3471 } else if (c2 == JIS_X_0201_1976_K) {
3472 /* simple kinsoku rules return 1 means no folding */
/*
 * The glyph names below are decoded from the original ISO-2022-JP comments.
 * NOTE(review): for 0xa4, 0xa3 and 0xa1 the original glyphs do not match
 * the JIS X 0201 assignments (0xA1 full stop, 0xA3 closing bracket,
 * 0xA4 comma) - confirm which characters were intended.
 */
3473 if (c1==(0xde&0x7f)) fold_state = 1; /* voiced sound mark (dakuten) */
3474 else if (c1==(0xdf&0x7f)) fold_state = 1; /* semi-voiced sound mark (handakuten) */
3475 else if (c1==(0xa4&0x7f)) fold_state = 1; /* ideographic full stop */
3476 else if (c1==(0xa3&0x7f)) fold_state = 1; /* fullwidth comma */
3477 else if (c1==(0xa1&0x7f)) fold_state = 1; /* closing corner bracket */
3478 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3479 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3481 fold_state = LF;/* add one new f_line before this character */
3484 fold_state = LF;/* add one new f_line before this character */
3487 /* kinsoku point in ASCII */
3488 if ( c1==')'|| /* { [ ( */
3499 /* just after special */
3500 } else if (!is_alnum(prev0)) {
3501 f_line = char_size(c2,c1);
3503 } else if ((prev0==SP) || /* ignored new f_line */
3504 (prev0==LF)|| /* ignored new f_line */
3505 (prev0&0x80)) { /* X0208 - ASCII */
3506 f_line = char_size(c2,c1);
3507 fold_state = LF;/* add one new f_line before this character */
3509 fold_state = 1; /* default no fold in ASCII */
/*
 * Cell values tested against c1; the enclosing condition is not visible in
 * this excerpt (presumably c2 == '!', JIS X 0208 row 1 - confirm).  The
 * glyph names are decoded from the original ISO-2022-JP comments.
 */
3513 if (c1=='"') fold_state = 1; /* ideographic comma */
3514 else if (c1=='#') fold_state = 1; /* ideographic full stop */
3515 else if (c1=='W') fold_state = 1; /* closing corner bracket */
3516 else if (c1=='K') fold_state = 1; /* fullwidth closing parenthesis */
3517 else if (c1=='$') fold_state = 1; /* fullwidth comma */
3518 else if (c1=='%') fold_state = 1; /* fullwidth full stop */
3519 else if (c1=='\'') fold_state = 1; /* fullwidth plus sign in the original comment; NOTE(review): cell 0x27 of row 1 is the fullwidth colon - confirm */
3520 else if (c1=='(') fold_state = 1; /* fullwidth semicolon */
3521 else if (c1==')') fold_state = 1; /* fullwidth question mark */
3522 else if (c1=='*') fold_state = 1; /* fullwidth exclamation mark */
3523 else if (c1=='+') fold_state = 1; /* voiced sound mark (dakuten) */
3524 else if (c1==',') fold_state = 1; /* semi-voiced sound mark (handakuten) */
3525 /* default no fold in kinsoku */
3528 f_line = char_size(c2,c1);
3529 /* add one new f_line before this character */
3532 f_line = char_size(c2,c1);
3534 /* add one new f_line before this character */
3539 /* terminator process */
3540 switch(fold_state) {
3542 OCONV_NEWLINE((*o_fconv));
3548 OCONV_NEWLINE((*o_fconv));
/*
 * State for z_conv: the previously seen character (z_prev2, z_prev1) that
 * is held back while waiting to see whether a voiced / semi-voiced sound
 * mark follows it.
 */
3559 static nkf_char z_prev2=0,z_prev1=0;
/*
 * z_conv: width-conversion filter; results are passed to o_zconv.
 *
 * c2, c1 - the character to convert.
 *
 * Visible behavior:
 *  - JIS X 0201 katakana is looked up in the cv table, or in dv / ev when
 *    the held-back character is followed by a voiced (0xde) or
 *    semi-voiced (0xdf) sound mark.
 *  - alpha_f&1 handles the JIS X 0208 alphabet row (c2 == 0x23) and, via
 *    the fv table, symbols of row 0x21.
 *  - alpha_f&8 replaces the ASCII characters > < " & with their HTML
 *    entity names.
 *  - JIS X 0208 katakana (c2 == 0x25) and some row-0x21 symbols are
 *    converted to JIS X 0201 katakana.
 *
 * NOTE(review): this listing omits some source lines (see the gaps in the
 * embedded line numbers); the comments describe only what is visible.
 */
3562 z_conv(nkf_char c2, nkf_char c1)
3565 /* if (c2) c1 &= 0x7f; assertion */
3567 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
3573 if (z_prev2 == JIS_X_0201_1976_K) {
3574 if (c2 == JIS_X_0201_1976_K) {
3575 if (c1 == (0xde&0x7f)) { /* voiced sound mark (dakuten) */
3577 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
3579 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* semi-voiced sound mark (handakuten) */
3581 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
3586 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
3588 if (c2 == JIS_X_0201_1976_K) {
3589 if (dv[(c1-SP)*2] || ev[(c1-SP)*2]) {
3590 /* wait for a voiced or semi-voiced sound mark */
3595 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
3606 if (alpha_f&1 && c2 == 0x23) {
3607 /* JISX0208 Alphabet */
3609 } else if (c2 == 0x21) {
3610 /* JISX0208 Kigou (symbols) */
3615 } else if (alpha_f&4) {
3620 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
3626 if (alpha_f&8 && c2 == 0) {
/* HTML entity name to emit in place of the character, if any */
3628 const char *entity = 0;
3630 case '>': entity = "&gt;"; break;
3631 case '<': entity = "&lt;"; break;
3632 case '\"': entity = "&quot;"; break;
3633 case '&': entity = "&amp;"; break;
/* emit the entity name one character at a time */
3636 while (*entity) (*o_zconv)(0, *entity++);
3642 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
3647 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
3651 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
3655 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
3659 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
3663 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
3667 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
3671 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
3675 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
3680 (*o_zconv)(JIS_X_0201_1976_K, c);
3683 } else if (c2 == 0x25) {
3684 /* JISX0208 Katakana */
/*
 * Indexed by c1-0x20. Each non-zero entry packs two bytes: the high byte
 * is emitted first and the low byte (0x5E / 0x5F where present) second.
 * A zero entry means the character is not converted here.
 */
3685 static const int fullwidth_to_halfwidth[] =
3687 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
3688 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
3689 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
3690 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
3691 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
3692 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
3693 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
3694 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
3695 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
3696 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
3697 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x0000,
3698 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
3700 if (fullwidth_to_halfwidth[c1-0x20]){
3701 c2 = fullwidth_to_halfwidth[c1-0x20];
3702 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
3704 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
/*
 * rot13(c): rotates a letter by 13 places. The visible arms add 13 for
 * characters up to 'M' / 'm' and subtract 13 for characters up to 'Z' / 'z'.
 * NOTE(review): the lower-bound tests and the fall-through arm are on
 * lines not present in this listing.
 */
3714 #define rot13(c) ( \
3716 (c <= 'M') ? (c + 13): \
3717 (c <= 'Z') ? (c - 13): \
3719 (c <= 'm') ? (c + 13): \
3720 (c <= 'z') ? (c - 13): \
/*
 * rot47(c): rotates a character by 47 places. The visible arms add 47 for
 * characters up to 'O' and subtract 47 for characters up to '~'.
 * NOTE(review): the lower-bound test and the fall-through arm are on
 * lines not present in this listing.
 */
3724 #define rot47(c) ( \
3726 ( c <= 'O') ? (c + 47) : \
3727 ( c <= '~') ? (c - 47) : \
/*
 * rot_conv: rotation filter; the (possibly modified) character is passed
 * on to o_rot_conv.
 * Single-byte sets (c2 == 0, JIS X 0201 katakana, ISO 8859-1) take the
 * first branch; presumably rot13 is applied there and rot47 otherwise -
 * the branch bodies are not visible in this listing.
 */
3732 rot_conv(nkf_char c2, nkf_char c1)
3734 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
3740 (*o_rot_conv)(c2,c1);
/*
 * hira_conv: hiragana / katakana conversion filter; output goes to
 * o_hira_conv.
 * NOTE(review): the outer conditions that select the conversion direction
 * and the assignments that change c2 / c1 are on lines not present in
 * this listing, so only the visible range tests are described.
 */
3744 hira_conv(nkf_char c2, nkf_char c1)
/* characters 0x21..0x73 of the row are handled by the first branch */
3748 if (0x20 < c1 && c1 < 0x74) {
3750 (*o_hira_conv)(c2,c1);
/* 0x74 is emitted as U+3094, and only when the output encoding is Unicode */
3752 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
3754 c1 = nkf_char_unicode_new(0x3094);
3755 (*o_hira_conv)(c2,c1);
/* row 0x21, characters 0x33 / 0x34 */
3758 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
3760 (*o_hira_conv)(c2,c1);
/* second group of tests: U+3094, row 0x24 (0x21..0x73), row 0x21 (0x35 / 0x36) */
3765 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
3768 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
3770 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
3774 (*o_hira_conv)(c2,c1);
/*
 * iso2022jp_check_conv: checks a character against a table of code ranges
 * before handing it to o_iso2022jp_check_conv.
 * NOTE(review): the table contents and the action taken on a match are on
 * lines not present in this listing (presumably the character is replaced).
 */
3779 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
3781 #define RANGE_NUM_MAX 18
/* each row is an inclusive {start, end} pair */
3782 static const nkf_char range[RANGE_NUM_MAX][2] = {
3803 nkf_char start, end, c;
/* c2 in 0x00..0x20 combined with c1 in 0x7f..0xff */
3805 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
/* rows 0x29..0x2f and 0x75..0x7e */
3809 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
/* scan the range table for an entry containing c */
3814 for (i = 0; i < RANGE_NUM_MAX; i++) {
3815 start = range[i][0];
3818 if (c >= start && c <= end) {
3823 (*o_iso2022jp_check_conv)(c2,c1);
3827 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
/*
 * Recognized MIME encoded-word prefixes ("\075" is '='). The order of the
 * entries matters: mime_begin_strict uses the matching index j to index
 * mime_priority_func as well.
 */
3829 static const unsigned char *mime_pattern[] = {
3830 (const unsigned char *)"\075?EUC-JP?B?",
3831 (const unsigned char *)"\075?SHIFT_JIS?B?",
3832 (const unsigned char *)"\075?ISO-8859-1?Q?",
3833 (const unsigned char *)"\075?ISO-8859-1?B?",
3834 (const unsigned char *)"\075?ISO-2022-JP?B?",
3835 (const unsigned char *)"\075?ISO-2022-JP?Q?",
3836 #if defined(UTF8_INPUT_ENABLE)
3837 (const unsigned char *)"\075?UTF-8?B?",
3838 (const unsigned char *)"\075?UTF-8?Q?",
3840 (const unsigned char *)"\075?US-ASCII?Q?",
3845 /* Marker used to raise the priority of the corresponding code */
/*
 * Input converters indexed like mime_pattern; an entry of 0 means no
 * converter is associated with that pattern.
 */
3846 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
3847 e_iconv, s_iconv, 0, 0, 0, 0,
3848 #if defined(UTF8_INPUT_ENABLE)
/* Character-set constants; the first six entries line up with the first six mime_pattern entries. */
3854 static const nkf_char mime_encode[] = {
3855 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3856 #if defined(UTF8_INPUT_ENABLE)
/* Transfer encoding letter ('B' or 'Q'); the first six entries line up with the first six mime_pattern entries. */
3863 static const nkf_char mime_encode_method[] = {
3864 'B', 'B','Q', 'B', 'B', 'Q',
3865 #if defined(UTF8_INPUT_ENABLE)
3873 /* MIME preprocessor fifo */
/* The size must stay a power of two: indices are wrapped with MIME_BUF_MASK. */
3875 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
3876 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
/* Access slot n of the ring buffer; n may be any unsigned counter value. */
3877 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
/* Members of the FIFO state (the enclosing declaration lines are not present in this listing). */
3879 unsigned char buf[MIME_BUF_SIZE];
3881 unsigned int last; /* decoded */
3882 unsigned int input; /* undecoded */
/* Input converter saved by mime_begin_strict and restored by unswitch_mime_getc. */
3884 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
/* Size of the recovery buffer and bound on look-ahead while parsing a MIME header. */
3886 #define MAXRECOVER 20
/* Pushes c back onto the front of the FIFO by moving top one slot backwards. */
3889 mime_input_buf_unshift(nkf_char c)
3891 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
/* ungetc replacement used while MIME decoding: the character goes back into the FIFO (f is not used on the visible line). */
3895 mime_ungetc(nkf_char c, FILE *f)
3897 mime_input_buf_unshift(c);
/*
 * ungetc for the undecoded side of the FIFO: either delegates to the
 * saved i_mungetc_buf or moves the input index one slot backwards.
 * NOTE(review): the condition choosing between the two is not visible
 * here; mime_getc_buf selects on mimebuf_f.
 */
3902 mime_ungetc_buf(nkf_char c, FILE *f)
3905 (*i_mungetc_buf)(c,f);
3907 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
/*
 * Reads the next undecoded character: from the saved i_mgetc_buf when
 * mimebuf_f is set, otherwise from the FIFO at the input index.
 */
3912 mime_getc_buf(FILE *f)
3914 /* we don't keep eof of mime_input_buf, because it contains ?= as
3915 a terminator. It was checked in mime_integrity. */
3916 return ((mimebuf_f)?
3917 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
/*
 * Installs the MIME-decoding reader: saves the current i_getc / i_ungetc
 * in i_mgetc / i_mungetc and replaces them with mime_getc / mime_ungetc.
 * In STRICT_MIME mode a second layer is inserted so that raw reads go
 * through the buffered mime_getc_buf / mime_ungetc_buf.
 * Does nothing when mime_getc is already installed.
 */
3921 switch_mime_getc(void)
3923 if (i_getc!=mime_getc) {
3924 i_mgetc = i_getc; i_getc = mime_getc;
3925 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
3926 if(mime_f==STRICT_MIME) {
3927 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
3928 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
/*
 * Undoes switch_mime_getc: restores the saved reader functions and, if an
 * input converter was saved in mime_iconv_back, reinstalls it and clears
 * the saved pointer.
 */
3934 unswitch_mime_getc(void)
3936 if(mime_f==STRICT_MIME) {
3937 i_mgetc = i_mgetc_buf;
3938 i_mungetc = i_mungetc_buf;
3941 i_ungetc = i_mungetc;
3942 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
3943 mime_iconv_back = NULL;
/*
 * mime_integrity: pre-reads a MIME encoded word into the FIFO and checks
 * that it is well formed before decoding starts.
 *
 * f - input stream, read through i_getc.
 * p - the MIME header pattern that was just matched; it is copied into
 *     the FIFO first so the text can be replayed if the check fails.
 *
 * The input and last indices are reset to top, the pattern is stored, and
 * q remembers where the encoded text begins. Characters are then read
 * until EOF, until the ring buffer is full, or until a character outside
 * the accepted set ('+', '/', '=', '?', alphanumerics) is seen. When the
 * closing "?=" is found the input index is rewound to q so decoding starts
 * after the header. Otherwise ("Incomplete MIME") mime_decode_mode is set
 * to 1 so the buffered text is replayed without decoding.
 *
 * NOTE(review): this listing omits some source lines, including the end
 * of the first comment in the body and the return statements.
 */
3947 mime_integrity(FILE *f, const unsigned char *p)
3951 /* In buffered mode, read until =? or NL or buffer full
3953 mime_input_state.input = mime_input_state.top;
3954 mime_input_state.last = mime_input_state.top;
3956 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
3958 q = mime_input_state.input;
3959 while((c=(*i_getc)(f))!=EOF) {
3960 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
3961 break; /* buffer full */
3963 if (c=='=' && d=='?') {
3964 /* checked. skip header, start decode */
3965 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3966 /* mime_last_input = mime_input_state.input; */
3967 mime_input_state.input = q;
/* only Base64 alphabet characters plus '=' and '?' are accepted */
3971 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
3973 /* Should we check length mod 4? */
3974 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3977 /* In case of Incomplete MIME, no MIME decode */
3978 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
3979 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
3980 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
3981 switch_mime_getc(); /* anyway we need buffered getc */
/*
 * mime_begin_strict: called after "=?" has been read; matches the rest of
 * the header against mime_pattern (case-insensitively via nkf_toupper).
 *
 * f - input stream, read through i_getc.
 *
 * Characters read so far are kept in r[] so they can be replayed when no
 * pattern matches. When the current pattern stops matching at position i,
 * the following patterns are tried, requiring the same first i characters
 * and a match on the current character. On success the decode mode is
 * taken from the pattern (the character two positions before the end of
 * the matched prefix), the current converter is saved in mime_iconv_back,
 * the converter for the pattern is installed, and the function returns
 * the result of mime_integrity().
 *
 * NOTE(review): this listing omits some source lines (see the gaps in the
 * embedded line numbers); the comments describe only what is visible.
 */
3986 mime_begin_strict(FILE *f)
3990 const unsigned char *p,*q;
3991 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
3993 mime_decode_mode = FALSE;
3994 /* =? has been checked */
3996 p = mime_pattern[j];
3999 for(i=2;p[i]>SP;i++) { /* start at =? */
4000 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4001 /* pattern fails, try next one */
4003 while (mime_pattern[++j]) {
4004 p = mime_pattern[j];
4005 for(k=2;k<i;k++) /* assume length(p) > i */
4006 if (p[k]!=q[k]) break;
4007 if (k==i && nkf_toupper(c1)==p[k]) break;
4009 p = mime_pattern[j];
4010 if (p) continue; /* found next one, continue */
4011 /* all fails, output from recovery buffer */
/* the encoding letter ('B' or 'Q') sits two characters before the end of the matched prefix */
4019 mime_decode_mode = p[i-2];
/* remember the current converter so unswitch_mime_getc can restore it */
4021 mime_iconv_back = iconv;
4022 set_iconv(FALSE, mime_priority_func[j]);
4023 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4025 if (mime_decode_mode=='B') {
4026 mimebuf_f = unbuf_f;
4028 /* do MIME integrity check */
4029 return mime_integrity(f,mime_pattern[j]);
/*
 * Non-strict MIME header parser (the function header is not present in
 * this listing; the comments below refer to it as mime_begin).
 * Every character read is also stored in the FIFO so that, when the text
 * turns out not to be a MIME header, it can be re-read from the buffer.
 * k remembers the FIFO position before the header so a real header can be
 * discarded by resetting last to k.
 */
4043 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4044 /* re-read and convert again from mime_buffer. */
4046 /* =? has been checked */
4047 k = mime_input_state.last;
4048 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4049 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4050 /* We accept any character type even if it is broken by new lines */
4051 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4052 if (c1==LF||c1==SP||c1==CR||
4053 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4055 /* Failed. But this could be another MIME preamble */
4057 mime_input_state.last--;
/* next character selects the transfer encoding: B (Base64) or Q (quoted-printable) */
4063 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4064 if (!(++i<MAXRECOVER) || c1==EOF) break;
4065 if (c1=='b'||c1=='B') {
4066 mime_decode_mode = 'B';
4067 } else if (c1=='q'||c1=='Q') {
4068 mime_decode_mode = 'Q';
4072 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4073 if (!(++i<MAXRECOVER) || c1==EOF) break;
4075 mime_decode_mode = FALSE;
4081 if (!mime_decode_mode) {
4082 /* false MIME preamble, restart from mime_buffer */
4083 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4084 /* Since we are in MIME mode until buffer becomes empty, */
4085 /* we never go into mime_begin again for a while. */
4088 /* discard mime preamble, and goto MIME mode */
4089 mime_input_state.last = k;
4090 /* do no MIME integrity check */
4091 return c1; /* used only for checking EOF */
/* Writes str followed by a newline to stderr; a null str is printed as "NULL". */
4102 debug(const char *str)
4105 fprintf(stderr, "%s\n", str ? str : "NULL");
/*
 * Records the detected input encoding name. The first call stores
 * codename; a later call with a different name replaces it with the empty
 * string, which get_guessed_code reports as "BINARY".
 */
4111 set_input_codename(const char *codename)
4113 if (!input_codename) {
4114 input_codename = codename;
4115 } else if (strcmp(codename, input_codename) != 0) {
4116 input_codename = "";
/*
 * Returns the name of the guessed input encoding and stores it in
 * input_codename.
 *  - empty name (conflicting detections)  -> "BINARY"
 *  - no name recorded                     -> "ASCII"
 *  - Shift_JIS / EUC-JP / ISO-2022-JP are refined to a vendor variant
 *    according to the score bits of the active input converter.
 */
4121 get_guessed_code(void)
4123 if (input_codename && !*input_codename) {
4124 input_codename = "BINARY";
/* score flags of the converter currently installed in iconv */
4126 struct input_code *p = find_inputcode_byfunc(iconv);
4127 if (!input_codename) {
4128 input_codename = "ASCII";
4129 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4130 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4131 input_codename = "CP932";
4132 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4133 if (p->score & (SCORE_X0212))
4134 input_codename = "EUCJP-MS";
4135 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4136 input_codename = "CP51932";
4137 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4138 if (p->score & (SCORE_KANA))
4139 input_codename = "CP50221";
4140 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4141 input_codename = "CP50220";
4144 return input_codename;
/*
 * print_guessed_code: prints the guessed input encoding to stdout,
 * prefixed with "filename: " when filename is not NULL. The visible
 * conditional chain maps input_eol to a suffix such as " (CR)", " (LF)",
 * " (CRLF)" or " (MIXED NL)".
 * Only compiled when neither PERL_XS nor WIN32DLL is defined.
 * NOTE(review): this listing omits some source lines of the function.
 */
4147 #if !defined(PERL_XS) && !defined(WIN32DLL)
4149 print_guessed_code(char *filename)
4151 if (filename != NULL) printf("%s: ", filename);
4152 if (input_codename && !*input_codename) {
4155 input_codename = get_guessed_code();
4157 printf("%s\n", input_codename);
4161 input_eol == CR ? " (CR)" :
4162 input_eol == LF ? " (LF)" :
4163 input_eol == CRLF ? " (CRLF)" :
4164 input_eol == EOF ? " (MIXED NL)" :
/*
 * hex_getc: shared reader for two-hex-digit escapes (used with ':' and
 * '%' as the introducer ch).
 *
 * ch - escape introducer character.
 * f  - input stream.
 * g  - underlying getc function.
 * u  - underlying ungetc function.
 *
 * When both characters after the introducer are hex digits, returns the
 * byte they encode. NOTE(review): the reads and the fallback paths for
 * non-hex characters are on lines not present in this listing.
 */
4174 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4176 nkf_char c1, c2, c3;
4182 if (!nkf_isxdigit(c2)){
4187 if (!nkf_isxdigit(c3)){
/* combine the two hex digits into one byte value */
4192 return (hex2bin(c2) << 4) | hex2bin(c3);
/* Reader for ':'-introduced hex escapes (function header not present in this listing). */
4198 return hex_getc(':', f, i_cgetc, i_cungetc);
/* ungetc counterpart of the ':'-escape reader: forwards to the saved i_cungetc. */
4202 cap_ungetc(nkf_char c, FILE *f)
4204 return (*i_cungetc)(c, f);
/* Reader for '%'-introduced hex escapes (function header not present in this listing). */
4210 return hex_getc('%', f, i_ugetc, i_uungetc);
/* ungetc counterpart of the '%'-escape reader: forwards to the saved i_uungetc. */
4214 url_ungetc(nkf_char c, FILE *f)
4216 return (*i_uungetc)(c, f);
/*
 * numchar_getc: reader that decodes numeric character references.
 * After an 'x' / 'X' marker up to 7 hex digits are accumulated; otherwise
 * up to 8 decimal digits. The resulting code point is returned through
 * nkf_char_unicode_new().
 * Only compiled when NUMCHAR_OPTION is defined.
 * NOTE(review): the reads into buf[], the terminator check and the
 * fallback path are on lines not present in this listing.
 */
4220 #ifdef NUMCHAR_OPTION
4222 numchar_getc(FILE *f)
4224 nkf_char (*g)(FILE *) = i_ngetc;
4225 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4236 if (buf[i] == 'x' || buf[i] == 'X'){
4237 for (j = 0; j < 7; j++){
4239 if (!nkf_isxdigit(buf[i])){
4246 c |= hex2bin(buf[i]);
4249 for (j = 0; j < 8; j++){
4253 if (!nkf_isdigit(buf[i])){
/* hex2bin is also used to get the value of a decimal digit */
4260 c += hex2bin(buf[i]);
4266 return nkf_char_unicode_new(c);
/* ungetc counterpart of numchar_getc: forwards to the saved i_nungetc. */
4276 numchar_ungetc(nkf_char c, FILE *f)
4278 return (*i_nungetc)(c, f);
/*
 * Reader that composes decomposed (NFD) byte sequences into their NFC
 * form (the function header is not present in this listing).
 *
 * The first byte is read through i_nfc_getc. Bytes are collected in buf
 * and compared against normalization_table[].nfd using a binary search
 * (lower / upper / mid). On a full match buf is refilled with the
 * corresponding .nfc bytes. At the end all bytes except the first are
 * pushed back through i_nfc_ungetc and the first one is returned.
 *
 * Only compiled when UNICODE_NORMALIZATION is defined.
 * NOTE(review): this listing omits some source lines (see the gaps in the
 * embedded line numbers); the comments describe only what is visible.
 */
4282 #ifdef UNICODE_NORMALIZATION
4287 nkf_char (*g)(FILE *f) = i_nfc_getc;
4288 nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
4289 nkf_buf_t *buf = nkf_buf_new(9);
4290 const unsigned char *array;
4291 int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
4292 nkf_char c = (*g)(f);
/*
 * EOF, values above 0xFF and bytes of the form 10xxxxxx cannot start a
 * table entry: return them unchanged, releasing buf first so that the
 * early exit does not leak it.
 */
4294 if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) { nkf_buf_dispose(buf); return c; }
4296 nkf_buf_push(buf, (unsigned char)c);
4298 while (lower <= upper) {
4299 int mid = (lower+upper) / 2;
4301 array = normalization_table[mid].nfd;
4302 for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
/* need one more input byte to continue the comparison */
4303 if (len >= nkf_buf_length(buf)) {
/* empty the search interval so the outer loop terminates */
4307 lower = 1, upper = 0;
4310 nkf_buf_push(buf, c);
4312 if (array[len] != nkf_buf_at(buf, len)) {
4313 if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
4314 else upper = mid - 1;
/* full match: replace the collected bytes with the composed form */
4321 array = normalization_table[mid].nfc;
4323 for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
4324 nkf_buf_push(buf, array[i]);
4328 } while (lower <= upper);
/* push back everything except the first byte, which is the return value */
4330 while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
4331 c = nkf_buf_pop(buf);
4332 nkf_buf_dispose(buf);
/* ungetc counterpart of the normalizing reader: forwards to the saved i_nfc_ungetc. */
4338 nfc_ungetc(nkf_char c, FILE *f)
4340 return (*i_nfc_ungetc)(c, f);
4342 #endif /* UNICODE_NORMALIZATION */
/*
 * base64decode: maps one Base64 alphabet character to its 6-bit value.
 * Besides the standard alphabet, '-' is accepted as 62 and '_' as 63.
 * The character constants '4', '>', '?' and 'G' are used for their
 * numeric values, as the inline comments spell out.
 * NOTE(review): the range tests for the letter branches and the return
 * statement are on lines not present in this listing.
 */
4346 base64decode(nkf_char c)
4351 i = c - 'A'; /* A..Z 0-25 */
4352 } else if (c == '_') {
4353 i = '?' /* 63 */ ; /* _ 63 */
4355 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4357 } else if (c > '/') {
4358 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4359 } else if (c == '+' || c == '-') {
4360 i = '>' /* 62 */ ; /* + and - 62 */
4362 i = '?' /* 63 */ ; /* / 63 */
4370 nkf_char c1, c2, c3, c4, cc;
4371 nkf_char t1, t2, t3, t4, mode, exit_mode;
4372 nkf_char lwsp_count;
4375 nkf_char lwsp_size = 128;
4377 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4378 return mime_input_buf(mime_input_state.top++);
4380 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4381 mime_decode_mode=FALSE;
4382 unswitch_mime_getc();
4383 return (*i_getc)(f);
4386 if (mimebuf_f == FIXED_MIME)
4387 exit_mode = mime_decode_mode;
4390 if (mime_decode_mode == 'Q') {
4391 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4393 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4394 if (c1<=SP || DEL<=c1) {
4395 mime_decode_mode = exit_mode; /* prepare for quit */
4398 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4402 mime_decode_mode = exit_mode; /* prepare for quit */
4403 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4404 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4405 /* end Q encoding */
4406 input_mode = exit_mode;
4408 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4409 while ((c1=(*i_getc)(f))!=EOF) {
4414 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4422 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4423 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4438 lwsp_buf[lwsp_count] = (unsigned char)c1;
4439 if (lwsp_count++>lwsp_size){
4441 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4442 lwsp_buf = lwsp_buf_new;
4448 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4450 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4451 i_ungetc(lwsp_buf[lwsp_count],f);
4454 nkf_xfree(lwsp_buf);
4457 if (c1=='='&&c2<SP) { /* this is soft wrap */
4458 while((c1 = (*i_mgetc)(f)) <=SP) {
4459 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4461 mime_decode_mode = 'Q'; /* still in MIME */
4462 goto restart_mime_q;
4465 mime_decode_mode = 'Q'; /* still in MIME */
4469 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4470 if (c2<=SP) return c2;
4471 mime_decode_mode = 'Q'; /* still in MIME */
4472 return ((hex2bin(c2)<<4) + hex2bin(c3));
4475 if (mime_decode_mode != 'B') {
4476 mime_decode_mode = FALSE;
4477 return (*i_mgetc)(f);
4481 /* Base64 encoding */
4483 MIME allows line break in the middle of
4484 Base64, but we are very pessimistic in decoding
4485 in unbuf mode because MIME encoded code may broken by
4486 less or editor's control sequence (such as ESC-[-K in unbuffered
4487 mode. ignore incomplete MIME.
4489 mode = mime_decode_mode;
4490 mime_decode_mode = exit_mode; /* prepare for quit */
4492 while ((c1 = (*i_mgetc)(f))<=SP) {
4497 if ((c2 = (*i_mgetc)(f))<=SP) {
4500 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4501 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4504 if ((c1 == '?') && (c2 == '=')) {
4507 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4508 while ((c1=(*i_getc)(f))!=EOF) {
4513 if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {
4521 if ((c1=(*i_getc)(f))!=EOF) {
4525 } else if ((c1=(*i_getc)(f))!=EOF && (c1==SP||c1==TAB)) {