NKF.mod/NKF.pm

   1 # Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
   2 # Copyright (c) 1996-2010, The nkf Project.
   3 # All rights reserved.
   4 #
   5 # This software is provided 'as-is', without any express or implied
   6 # warranty. In no event will the authors be held liable for any damages
   7 # arising from the use of this software.
   8 #
   9 # Permission is granted to anyone to use this software for any purpose,
  10 # including commercial applications, and to alter it and redistribute it
  11 # freely, subject to the following restrictions:
  12 #
  13 # 1. The origin of this software must not be misrepresented; you must not
  14 # claim that you wrote the original software. If you use this software
  15 # in a product, an acknowledgment in the product documentation would be
  16 # appreciated but is not required.
  17 #
  18 # 2. Altered source versions must be plainly marked as such, and must not be
  19 # misrepresented as being the original software.
  20 #
  21 # 3. This notice may not be removed or altered from any source distribution.
  22
  23 package NKF;    # Perl-side loader for the nkf (Network Kanji Filter) extension
  24
  25 use strict;
  26 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);    # package globals, declared so they pass 'use strict'
  27
  28 require Exporter;
  29 require DynaLoader;
  30
  31 @ISA = qw(Exporter DynaLoader);    # inherit import() from Exporter and bootstrap() from DynaLoader
  32 # Names exported into the caller's namespace by default. None of them is
  33 # defined as a Perl sub in this file; presumably they are supplied by the
  34 # compiled code loaded by the bootstrap call below -- TODO confirm.
  35 @EXPORT = qw(
  36         nkf     nkf_continue    inputcode
  37 );
  38 $VERSION = '2.13';
  39
  40 bootstrap NKF $VERSION;    # indirect-object form of NKF->bootstrap($VERSION)
  41
  42 # No Perl-level subs are defined here.
  43
  44 # Everything after __END__ is POD documentation, not code.
  45
  46 1;    # a module must end with a true value so that require/use succeeds
  47 __END__
  48
  49 #
  50 # Text from =begin up to =begin COMMAND is the Perl/NKF documentation.
  51 # Text from =begin COMMAND up to =end is the nkf command documentation.
  52 #
  53
  54 =head1 NAME
  55
  56 =begin
  57
  58 NKF - Perl extension for Network Kanji Filter
  59
  60 =begin COMMAND
  61
  62 nkf - Network Kanji Filter
  63
  64 =end
  65
  66 =head1 SYNOPSIS
  67
  68 =begin
  69
  70   use NKF;
  71   $output = nkf("-s",$input);
  72
  73 =begin COMMAND
  74
  75 nkf B<[-butjnesliohrTVvwWJESZxXFfmMBOcdILg]> B<[>I<file ...>B<]>
  76
  77 =end
  78
  79 =head1 DESCRIPTION
  80
  81 =begin
  82
  83 This is a Perl Extension version of nkf (Network Kanji Filter).
  84 It converts the last argument and returns the converted result. Conversion
  85 details are specified by flags before the last argument.
  86
  87 =end
  88
  89 B<Nkf> is yet another kanji code converter among networks, hosts and terminals.
  90 It converts input kanji code to designated kanji code
  91 such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8, UTF-16 or UTF-32.
  92
  93 One of the most distinctive features of B<nkf> is that it guesses the input kanji encoding.
  94 It currently recognizes ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8, UTF-16 and UTF-32.
  95 So users needn't set the input kanji code explicitly.
  96
  97 By default, X0201 kana is converted into X0208 kana.
  98 For X0201 kana, SO/SI, SSO and ESC-(-I methods are supported.
  99 For automatic code detection, nkf assumes no X0201 kana in Shift_JIS.
 100 To accept X0201 in Shift_JIS, use B<-X>, B<-x> or B<-S>.
 101
 102 =head1 OPTIONS
 103
 104 =over
 105
 106 =item B<-J -S -E -W -W16 -W32 -j -s -e -w -w16 -w32>
 107
 108 Specify input and output encodings. Upper case is input.
 109 cf. --ic and --oc.
 110
 111 =over
 112
 113 =item B<-J>
 114
 115 ISO-2022-JP (JIS code).
 116
 117 =item B<-S>
 118
 119 Shift_JIS and JIS X 0201 kana.
 120 EUC-JP is recognized as X0201 kana. Without B<-x> flag,
 121 JIS X 0201 Katakana (a.k.a. halfwidth kana) is converted into JIS X 0208.
 122 If you use Windows, see Windows-31J (CP932).
 123
 124 =item B<-E>
 125
 126 EUC-JP.
 127
 128 =item B<-W>
 129
 130 UTF-8N.
 131
 132 =item B<-W16[BL][0]>
 133
 134 UTF-16.
 135 B or L gives whether Big Endian or Little Endian.
 136 0 gives whether to put a BOM or not.
 137
 138 =item B<-W32[BL][0]>
 139
 140 UTF-32.
 141 B or L gives whether Big Endian or Little Endian.
 142 0 gives whether to put a BOM or not.
 143
 144 =back
 145
 146 =item B<-b -u>
 147
 148 Output is buffered (DEFAULT), Output is unbuffered.
 149
 150 =item B<-t>
 151
 152 No conversion.
 153
 154 =item B<-i[@B]>
 155
 156 Specify the escape sequence for JIS X 0208.
 157
 158 =over
 159
 160 =item B<-i@>
 161
 162 Use ESC $ @. (JIS X 0208-1978)
 163
 164 =item B<-iB>
 165
 166 Use ESC $ B. (JIS X 0208-1983/1990 DEFAULT)
 167
 168 =back
 169
 170 =item B<-o[BJ]>
 171
 172 Specify the escape sequence for US-ASCII/JIS X 0201 Roman. (DEFAULT B)
 173
 174 =item B<-r>
 175
 176 {de/en}crypt ROT13/47
 177
 178 =item B<-h[123] --hiragana --katakana --katakana-hiragana>
 179
 180 =over
 181
 182 =item B<-h1 --hiragana>
 183
 184 Katakana to Hiragana conversion.
 185
 186 =item B<-h2 --katakana>
 187
 188 Hiragana to Katakana conversion.
 189
 190 =item B<-h3 --katakana-hiragana>
 191
 192 Katakana to Hiragana and Hiragana to Katakana conversion.
 193
 194 =back
 195
 196 =item B<-T>
 197
 198 Text mode output (MS-DOS)
 199
 200 =item B<-f[I<m> [- I<n>]]>
 201
 202 Folding on I<m> length with I<n> margin in a line.
 203 If I<m> and I<n> are omitted, the fold length is 60 and the fold margin is 10.
 204
 205 =item B<-F>
 206
 207 New line preserving line folding.
 208
 209 =item B<-Z[0-3]>
 210
 211 Convert X0208 alphabet (Fullwidth Alphabets) to ASCII.
 212
 213 =over
 214
 215 =item B<-Z -Z0>
 216
 217 Convert X0208 alphabet to ASCII.
 218
 219 =item B<-Z1>
 220
 221 Convert X0208 kankaku to single ASCII space.
 222
 223 =item B<-Z2>
 224
 225 Convert X0208 kankaku to double ASCII spaces.
 226
 227 =item B<-Z3>
 228
 229 Replace fullwidth >, <, ", & with '&gt;', '&lt;', '&quot;', '&amp;' as in HTML.
 230
 231 =back
 232
 233 =item B<-X -x>
 234
 235 With B<-X> or without this option, X0201 is converted into X0208 Kana.
 236 With B<-x>, try to preserve X0208 kana and do not convert X0201 kana to X0208.
 237 In JIS output, ESC-(-I is used. In EUC output, SS2 is used.
 238
 239 =item B<-B[0-2]>
 240
 241 Assume broken JIS-Kanji input, which lost ESC.
 242 Useful when your site is using old B-News Nihongo patch.
 243
 244 =over
 245
 246 =item B<-B1>
 247
 248 allows any chars after ESC-( or ESC-$.
 249
 250 =item B<-B2>
 251
 252 force ASCII after NL.
 253
 254 =back
 255
 256 =item B<-I>
 257
 258 Replace non-ISO-2022-JP characters with a geta character
 259 (the substitute character in Japanese).
 260
 261 =item B<-m[BQN0]>
 262
 263 MIME ISO-2022-JP/ISO8859-1 decode. (DEFAULT)
 264 To see ISO8859-1 (Latin-1) -l is necessary.
 265
 266 =over
 267
 268 =item B<-mB>
 269
 270 Decode MIME base64 encoded stream. Remove header or other part before
 271 conversion.
 272
 273 =item B<-mQ>
 274
 275 Decode MIME quoted stream. '_' in quoted stream is converted to space.
 276
 277 =item B<-mN>
 278
 279 Non-strict decoding.
 280 It allows line break in the middle of the base64 encoding.
 281
 282 =item B<-m0>
 283
 284 No MIME decode.
 285
 286 =back
 287
 288 =item B<-M>
 289
 290 MIME encode. Header style. All ASCII code and control characters are intact.
 291
 292 =over
 293
 294 =item B<-MB>
 295
 296 MIME encode Base64 stream.
 297 Kanji conversion is performed before encoding, so this cannot be used as a picture encoder.
 298
 299 =item B<-MQ>
 300
 301 Perform quoted encoding.
 302
 303 =back
 304
 305 =item B<-l>
 306
 307 Input and output code is ISO8859-1 (Latin-1) and ISO-2022-JP.
 308 B<-s>, B<-e> and B<-x> are not compatible with this option.
 309
 310 =item B<-L[uwm] -d -c>
 311
 312 Convert line breaks.
 313
 314 =over
 315
 316 =item B<-Lu -d>
 317
 318 unix (LF)
 319
 320 =item B<-Lw -c>
 321
 322 windows (CRLF)
 323
 324 =item B<-Lm>
 325
 326 mac (CR)
 327
 328 Without this option, nkf doesn't convert line breaks.
 329
 330 =back
 331
 332 =item B<--fj --unix --mac --msdos --windows>
 333
 334 Convert for these systems.
 335
 336 =item B<--jis --euc --sjis --mime --base64>
 337
 338 Convert to named code.
 339
 340 =item B<--jis-input --euc-input --sjis-input --mime-input --base64-input>
 341
 342 Assume input system
 343
 344 =item B<--ic=I<input codeset> --oc=I<output codeset>>
 345
 346 Set the input or output codeset.
 347 NKF supports the following codesets; codeset names are case insensitive.
 348
 349 =over
 350
 351 =item ISO-2022-JP
 352
 353 a.k.a. RFC1468, 7bit JIS, JUNET
 354
 355 =item EUC-JP (eucJP-nkf)
 356
 357 a.k.a. AT&T JIS, Japanese EUC, UJIS
 358
 359 =item eucJP-ascii
 360
 361 =item eucJP-ms
 362
 363 =item CP51932
 364
 365 Microsoft Version of EUC-JP.
 366
 367 =item Shift_JIS
 368
 369 a.k.a. SJIS, MS_Kanji
 370
 371 =item Windows-31J
 372
 373 a.k.a. CP932
 374
 375 =item UTF-8
 376
 377 same as UTF-8N
 378
 379 =item UTF-8N
 380
 381 UTF-8 without BOM
 382
 383 =item UTF-8-BOM
 384
 385 UTF-8 with BOM
 386
 387 =item UTF8-MAC (input only)
 388
 389 decomposed UTF-8
 390
 391 =item UTF-16
 392
 393 same as UTF-16BE
 394
 395 =item UTF-16BE
 396
 397 UTF-16 Big Endian without BOM
 398
 399 =item UTF-16BE-BOM
 400
 401 UTF-16 Big Endian with BOM
 402
 403 =item UTF-16LE
 404
 405 UTF-16 Little Endian without BOM
 406
 407 =item UTF-16LE-BOM
 408
 409 UTF-16 Little Endian with BOM
 410
 411 =item UTF-32
 412
 413 same as UTF-32BE
 414
 415 =item UTF-32BE
 416
 417 UTF-32 Big Endian without BOM
 418
 419 =item UTF-32BE-BOM
 420
 421 UTF-32 Big Endian with BOM
 422
 423 =item UTF-32LE
 424
 425 UTF-32 Little Endian without BOM
 426
 427 =item UTF-32LE-BOM
 428
 429 UTF-32 Little Endian with BOM
 430
 431 =back
 432
 433 =item B<--fb-{skip, html, xml, perl, java, subchar}>
 434
 435 Specify the way that nkf handles unassigned characters.
 436 Without this option, --fb-skip is assumed.
 437
 438 =item B<--prefix=I<escape character>I<target character>..>
 439
 440 When nkf converts to Shift_JIS,
 441 nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
 442 1st byte of argument is the escape character and following bytes are target characters.
 443
 444 =item B<--no-cp932ext>
 445
 446 Handle the characters extended in CP932 as unassigned characters.
 447
 448 =item B<--no-best-fit-chars>
 449
 450 When converting from Unicode to an encoded byte sequence,
 451 don't convert characters which are not round-trip safe.
 452 When converting from Unicode to Unicode,
 453 with this and the -x option, nkf can be used as a UTF converter.
 454 (In other words, without this and the -x option, nkf doesn't preserve some characters.)
 455
 456 When nkf converts strings that are related to paths, you should use this option.
 457
 458 =item B<--cap-input>
 459
 460 Decode hex encoded characters.
 461
 462 =item B<--url-input>
 463
 464 Unescape percent escaped characters.
 465
 466 =item B<--numchar-input>
 467
 468 Decode character reference, such as "&#....;".
 469
 470 =begin COMMAND
 471
 472 =item B<--in-place[=>I<SUFFIX>B<]>  B<--overwrite[=>I<SUFFIX>B<]>
 473
 474 Overwrite the B<original> listed files with the filtered result.
 475
 476 B<Note> --overwrite preserves timestamps of original files.
 477
 478 =item B<--guess=[12]>
 479
 480 Print guessed encoding and newline. (2 is default, 1 is only encoding)
 481
 482 =item B<--help>
 483
 484 Print nkf's help.
 485
 486 =item B<--version>
 487
 488 Print nkf's version.
 489
 490 =end
 491
 492 =item B<-->
 493
 494 Ignore rest of -option.
 495
 496 =back
 497
 498 =head1 AUTHOR
 499
 500 Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
 501
 502 Copyright (c) 1996-2013, The nkf Project.
 503
 504 =begin
 505
 506 =head1 SEE ALSO
 507
 508 perl(1).   nkf(1)
 509
 510 =end
 511
 512 =cut