OSDN Git Service

first commit
[slackware/slackbuild.git] / stardict-tools / stardict-tools-3.0.1 / src / ncce2stardict.pl
1 #!/usr/bin/perl -w
2 # by ChaosLawful@SMTH at 2006-3-23
3 use strict;
4 use Encode;
5 use FileHandle;
6 use Fcntl ':seek';      # for constants SEEK_*
7
my @ncce_dicts=("ec","ce");

# Convert every NCCE dictionary listed above into a tab-separated text file.
# For a dictionary named X this reads X.idx and X.lib from the current
# directory and writes ncce_X.tab:
#   - X.idx starts with a 32-bit record count, followed by one 32-bit
#     offset into X.lib per record;
#   - each record in X.lib runs up to a terminating byte 0x01, and becomes
#     readable after 0x1e is added to every byte; the decoded record is
#     "<word>\x1e<explanation>".
# NOTE(review): unpack "L" reads the integers in the byte order of the
# machine running the script. If the files are always little-endian, "V"
# would be the portable choice -- confirm against real data.
for my $ncce_dict (@ncce_dicts) {
        my $buf;
        my $idxName="$ncce_dict.idx";
        my $libName="$ncce_dict.lib";
        my $outName="ncce_$ncce_dict.tab";

        my $idxFH=FileHandle->new($idxName,"r") or die "cannot open $idxName: $!";
        my $libFH=FileHandle->new($libName,"r") or die "cannot open $libName: $!";
        my $outFH=FileHandle->new($outName,"w") or die "cannot create $outName: $!";
        binmode($idxFH);
        binmode($libFH);
        binmode($outFH);

        my $got=sysread($idxFH,$buf,4);
        die "cannot read record count from $idxName: $!" unless defined $got;
        die "$idxName is too short to hold a record count" unless $got==4;
        my ($totalRecord)=unpack("L",$buf);     # got total record number

        for my $idxNo (1..$totalRecord) {
                # The index is read with sysread, so position it with sysseek:
                # buffered seek and unbuffered sysread must not be mixed.
                defined(sysseek($idxFH,$idxNo*4,SEEK_SET))
                        or die "cannot seek to entry $idxNo in $idxName: $!";
                $got=sysread($idxFH,$buf,4);
                die "cannot read entry $idxNo from $idxName: $!" unless defined $got;
                die "$idxName is truncated at entry $idxNo" unless $got==4;

                my ($off)=unpack("L",$buf);     # seek into lib file
                seek($libFH,$off,SEEK_SET)
                        or die "cannot seek to offset $off in $libName: $!";
                {
                        # and read corresponding entry record, which ends at byte 0x01
                        local $/=chr(1);
                        $buf=<$libFH>;
                        chomp($buf) if defined $buf;
                }
                unless (defined $buf) {
                        # offset points at or past the end of the lib file
                        warn "$libName: no record at offset $off (entry $idxNo), skipped\n";
                        next;
                }

                # decrypt record; the mask keeps each value in byte range, which
                # is what pack "C" would do anyway, but without a warning
                $buf=pack "C*",map { ($_+0x1e) & 0xff } unpack "C*",$buf;

                $buf=~s/\\/\\\\/gs;             # escape backslashes for the output format
                my @fields=split(/\x1e/,$buf);  # split entry into word and explanation
                toTextDict($outFH,$fields[0],$fields[1]);       # output
        }

        # buffered write errors only surface when the handle is closed
        $outFH->close or die "cannot write $outName: $!";
        $libFH->close;
        $idxFH->close;
}
41
# toTextDict($fh,$word,$explain)
#
# Write one dictionary entry to $fh as a single line "<word>\t<explanation>\n".
#
#   $fh      - output filehandle; it receives UTF-8 encoded bytes
#   $word    - headword as cp936 bytes
#   $explain - explanation as cp936 bytes
#
# An undefined $word or $explain is treated as the empty string.
# Both fields have leading and trailing whitespace removed and runs of spaces
# squeezed to one. In the explanation every ";" (with any whitespace after
# it) becomes the two characters backslash + "n". A fullwidth comma that is
# directly followed by an ASCII letter, digit or underscore becomes ",".
sub toTextDict
{
        my ($fh,$word,$explain)=@_;
        # Kingsoft custom dictionary's export format:
        # every line contains one entry, whose format is:
        # <word>|<explanation>
        # where <explanation>:=<literal>[\r\n<explanation>]

        # a record without a separator has no explanation field at all
        $word='' unless defined $word;
        $explain='' unless defined $explain;

        # strip leading and trailing spaces, squeeze inner spaces
        for ($word,$explain) {
                s/^\s+//gs;
                y/ / /s;
                s/\s+$//gs;
        }

        # split NCCE entry into multiple explanations, optional
        # (safe on raw bytes: ";" and ASCII whitespace never occur inside a
        # double-byte cp936 character)
        $explain=~s/;\s*/\\n/gs;

        # decode first, so the comma fix below works on whole characters
        $word=decode("cp936",$word);
        $explain=decode("cp936",$explain);

        # convert fullwidth comma between alphadigits into halfwidth comma, optional
        # Matching the bytes \xa3\xac before decoding could hit the tail of one
        # double-byte character plus the head of the next and corrupt both.
        # The lookahead is an explicit ASCII class because \w on a decoded
        # string would also match CJK characters.
        for ($word,$explain) {
                s/\x{ff0c}(?=[A-Za-z0-9_])/,/g;
        }

        # convert to utf-8
        $word=encode("utf-8",$word);
        $explain=encode("utf-8",$explain);
        print $fh "$word\t$explain\n";
}
69