5 \brief Bayesian filter
7 $Id: GikoBayesian.pas,v 1.14 2004/11/01 09:51:57 yoffy Exp $
10 //! Do not include hiragana in the dictionary
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
15 //==================================================
17 //==================================================
20 //==================================================
22 //==================================================
24 {!***********************************************************
25 \brief Word properties
26 ************************************************************}
27 TWordInfo = class( TObject )
29 FNormalWord : Integer; //!<
//!< Number of times the word appeared as a normal word
30 FImportantWord : Integer; //!<
//!< Number of times the word appeared as an important (noteworthy) word
31 FNormalText : Integer; //!<
//!< Number of texts that contained the word as a normal word
32 FImportantText : Integer; //!<
//!< Number of texts that contained the word as an important (noteworthy) word
35 property NormalWord : Integer read FNormalWord write FNormalWord;
36 property ImportantWord : Integer read FImportantWord write FImportantWord;
37 property NormalText : Integer read FNormalText write FNormalText;
38 property ImportantText : Integer read FImportantText write FImportantText;
41 {!***********************************************************
42 \brief Properties of an analyzed word
43 ************************************************************}
44 TWordCountInfo = class( TObject )
46 FWordCount : Integer; //!<
//!< Word count
49 property WordCount : Integer read FWordCount write FWordCount;
52 {!***********************************************************
53 \brief List of analyzed words
54 ************************************************************}
55 // TWordCount = class( THashedStringList ) // extremely slow
56 TWordCount = class( TStringList )
59 destructor Destroy; override;
62 {!***********************************************************
63 \brief Filter algorithm
64 ************************************************************}
65 TGikoBayesianAlgorithm =
66 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
68 {!***********************************************************
69 \brief Bayesian filter
70 ************************************************************}
71 // TGikoBayesian = class( THashedStringList ) // extremely slow
72 TGikoBayesian = class( TStringList )
74 FFilePath : string; //!<
//!< Path of the file that was loaded
75 function GetObject( const name : string ) : TWordInfo;
76 procedure SetObject( const name : string; value : TWordInfo );
80 destructor Destroy; override;
82 //! Reads the learning history from a file
83 procedure LoadFromFile( const filePath : string );
85 //! Saves the learning history to a file
86 procedure SaveToFile( const filePath : string );
88 //! Saves the learning history to a file
91 //! Gets the information stored for a word
92 property Objects[ const name : string ] : TWordInfo
93 read GetObject write SetObject; default;
95 //! Counts the words contained in a text
98 wordCount : TWordCount );
101 \brief Determines the importance score of a text based on Paul Graham's method
102 \return Importance score of the text (0.0 = not worth noting ... 1.0 = should be noted)
104 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
107 \brief Determines the importance score of a text based on the GaryRobinson method
108 \return Importance score of the text (0.0 = not worth noting ... 1.0 = should be noted)
110 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
113 \brief Determines the importance score of a text based on the GaryRobinson-Fisher method
114 \return Importance score of the text (0.0 = not worth noting ... 1.0 = should be noted)
116 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
119 \brief Analyzes a text
120 \param text Text to analyze
121 \param wordCount Receives the list of analyzed words
122 \param algorithm Specifies the algorithm used to determine the importance score
123 \return Importance score of the text (0.0 = not worth noting ... 1.0 = should be noted)
125 This simply runs CountWord and Calcxxxxx together.
129 wordCount : TWordCount;
130 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
134 \brief Learns a text
135 \param wordCount Word list produced by Parse
136 \param isImportant True to remember the text as one that should be noted
139 wordCount : TWordCount;
140 isImportant : Boolean );
143 \brief Forgets a learning result
144 \param wordCount Word list produced by Parse
145 \param isImportant True if the text had been remembered as one that should be noted
146 \warning Whether a text has already been learned cannot be verified.<br>
147 Calling Forget on a text that was never passed to Learn, or with the wrong
148 isImportant value, corrupts the database.<br>
149 Keep track of what has been learned yourself.
151 This does not clear all learning results.<br>
152 Only the learning result of the text that produced wordCount (the text argument of Parse) is cleared.<br><br>
154 Mainly used in the order Forget -> Learn to switch a text between important and non-important.
157 wordCount : TWordCount;
158 isImportant : Boolean );
161 //==================================================
163 //==================================================
166 SysUtils, Math, Windows,
170 GIKO_BAYESIAN_FILE_VERSION = '1.0';
// Character classes distinguished by the tokenizer: white space, half-width
// classes, then the full-width (W-prefixed) classes.
// NOTE(review): CountWord declares a local enum with the same name in which
// ModeNum and ModeHanKana appear in the opposite order; the cast
// Modes( CharMode1[ ... ] ) is written inside CountWord, so confirm which
// ordering the numeric values stored in CharMode1 are meant to follow.
172 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173 ModeWGraph, ModeWAlpha, ModeWNum,
174 ModeWHira, ModeWKata, ModeWKanji);
// Character class for every single-byte value, 16 entries per row, indexed by
// the byte itself. CountWord casts the stored value to Modes.
// Values used: 0 = white space / control, 1 = punctuation, 2 = ASCII digit,
// 3 = ASCII letter, 4 = bytes $A5..$DF.
// NOTE(review): the symbolic names of 2, 3 and 4 depend on which Modes
// declaration is in scope - confirm against CountWord.
// Fix: entry $39 ('9') used to hold 1 although '0'..'8' hold 2, so the digit
// nine was split off as punctuation; it is now classed with the other digits.
176 CharMode1 : array [ 0..255 ] of Byte =
// $00..$1F : control characters
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// $20..$2F : space, then punctuation
180 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// $30..$3F : '0'..'9' (ten entries), then : ; < = > ?
181 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
// $40..$5F : '@', 'A'..'Z', then [ \ ] ^ _
182 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
// $60..$7F : '`', 'a'..'z', then four punctuation marks and DEL
184 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
// $80..$9F : never reached through this table when the byte is a double-byte
// lead byte (CountWord tests kKanji first)
187 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// $A0..$DF : $A0 = 0, $A1..$A4 = 1, $A5..$DF = 4
// (presumably the half-width katakana block - confirm)
189 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
// $E0..$FF : double-byte lead bytes, see $80..$9F
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
197 //************************************************************
199 //************************************************************
201 //==============================
203 //==============================
// Splits the leading token off s: returns the text in front of the first
// occurrence of delimiter and removes that text plus the delimiter from s.
// NOTE(review): listing lines 205-211 (local declarations and whatever
// handles p = 0, i.e. delimiter not found) are not visible here - confirm
// how that case is treated.
204 function RemoveToken(var s: string;const delimiter: string): string;
// Position of the first delimiter in s (AnsiPos is multi-byte aware).
208 p := AnsiPos(delimiter, s);
// Everything before the delimiter is the token.
212 Result := Copy(s, 1, p - 1);
// Keep only what follows the token and the delimiter.
213 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
216 //==============================
218 //==============================
// Comparison callback handed to TList.Sort by CalcPaulGraham.
// Each list item is a Single stored in the pointer slot; the two values are
// compared by their distance from 0.5.
// NOTE(review): the Single( pointer ) casts require Pointer and Single to
// have the same size (4 bytes) - confirm before building for a 64-bit target.
// NOTE(review): the lines that turn v1 / v2 into the returned Integer are not
// visible in this listing, so the sort direction cannot be stated here.
219 function AbsSort( p1, p2 : Pointer ) : Integer;
224 v1 := Abs( Single( p1 ) - 0.5 );
225 v2 := Abs( Single( p2 ) - 0.5 );
235 //************************************************************
237 //************************************************************
// Creates the word list; adding an already present word is ignored and
// words are compared case-sensitively.
// NOTE(review): Duplicates only takes effect on a sorted TStringList; the
// statement that would set Sorted is not visible in this listing - confirm.
238 constructor TWordCount.Create;
241 Duplicates := dupIgnore;
242 CaseSensitive := True;
// Walks the list from the last entry to the first and handles every entry
// that has an object attached.
// NOTE(review): the statement executed for non-nil objects (presumably
// Objects[ i ].Free) and the inherited call are not visible in this listing.
247 destructor TWordCount.Destroy;
252 for i := Count - 1 downto 0 do
253 if Objects[ i ] <> nil then
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
264 //==============================
266 //==============================
// Creates the filter dictionary; adding an already present word is ignored
// and words are compared case-sensitively.
// NOTE(review): Duplicates only takes effect on a sorted TStringList; the
// statement that would set Sorted is not visible in this listing - confirm.
267 constructor TGikoBayesian.Create;
270 Duplicates := dupIgnore;
271 CaseSensitive := True;
276 //==============================
278 //==============================
// Frees every object attached to the dictionary entries, last entry first.
// "inherited Objects" addresses the index-based TStringList property, because
// this class redeclares Objects as a name-indexed property.
279 destructor TGikoBayesian.Destroy;
284 for i := Count - 1 downto 0 do
285 if inherited Objects[ i ] <> nil then
286 inherited Objects[ i ].Free;
// Loads the learning history from filePath into this dictionary.
// File layout as written by SaveToFile: the first line is the version string,
// every following line is  word #1 NormalWord #1 ImportantWord #1 NormalText
// #1 ImportantText  with the four counters in hexadecimal.
// NOTE(review): several lines are missing from this listing (the action taken
// when the file does not exist, the assignment of s, try/finally and the
// release of sl) - confirm them in the full source.
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
// Remember the path so that Save can write back to the same file.
301 FFilePath := filePath;
303 if not FileExists( filePath ) then
306 sl := TStringList.Create;
308 sl.LoadFromFile( filePath );
// Start at index 1: index 0 holds the version string, not a word.
310 for i := 1 to sl.Count - 1 do begin
312 name := RemoveToken( s, #1 );
313 info := TWordInfo.Create;
// The '$' prefix makes StrToIntDef parse the field as hexadecimal; a field
// that cannot be parsed yields 0.
314 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
// NOTE(review): with Duplicates = dupIgnore on a sorted list, AddObject does
// not store the object when name already exists, so info would leak for a
// duplicated word in the file - confirm whether that can occur.
319 AddObject( name, info );
// Writes the learning history to filePath: the version string on the first
// line, then one line per word with the four counters in hexadecimal,
// fields separated by #1.
// NOTE(review): the statement that appends s to sl, try/finally and the
// release of sl are not visible in this listing.
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
// Remember the path so that a later Save writes to the same file.
335 FFilePath := filePath;
337 sl := TStringList.Create;
340 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
342 for i := 0 to Count - 1 do begin
// "inherited Objects" is the index-based property of TStringList.
343 info := TWordInfo( inherited Objects[ i ] );
344 s := Strings[ i ] + #1
345 + Format('%x', [info.NormalWord]) + #1
346 + Format('%x', [info.ImportantWord]) + #1
347 + Format('%x', [info.NormalText]) + #1
348 + Format('%x', [info.ImportantText]);
353 sl.SaveToFile( filePath );
// Saves to the path remembered by the last LoadFromFile / SaveToFile call;
// does nothing when no path has been recorded yet.
360 procedure TGikoBayesian.Save;
363 if FFilePath <> '' then
364 SaveToFile( FFilePath );
368 //==============================
370 //==============================
// Getter of the name-indexed Objects property: looks the word up with Find
// and returns the TWordInfo attached to it.
// NOTE(review): the branch taken when the word is not found is not visible
// in this listing; callers compare the result with nil, so it presumably
// returns nil - confirm.
// NOTE(review): Find relies on the list being sorted - confirm Sorted is set.
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
376 if Find( name, idx ) then
377 Result := TWordInfo( inherited Objects[ idx ] )
383 //==============================
385 //==============================
// Setter of the name-indexed Objects property: replaces the object of an
// existing word, otherwise adds the word together with value.
// NOTE(review): the object previously attached to an existing word is
// overwritten without being freed in the visible lines - confirm ownership.
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
391 if Find( name, idx ) then
392 inherited Objects[ idx ] := value
// Listing line 393 (presumably "else") is not visible here.
394 AddObject( name, value );
399 //==============================
401 //==============================
402 procedure TGikoBayesian.CountWord(
404 wordCount : TWordCount );
406 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407 ModeWGraph, ModeWAlpha, ModeWNum,
408 ModeWHira, ModeWKata, ModeWKanji);
410 p, tail, last : PChar;
411 mode, newMode : Modes;
414 wHiraDelimiter : TStringList;
415 wHiraFinalDelimiter : TStringList;
416 wKanjiDelimiter : TStringList;
419 countInfo : TWordCountInfo;
421 function cutBoth( _aWord : string; _delim : TStringList ) : string;
425 for _i := 0 to _delim.Count - 1 do begin
426 _aWord := CustomStringReplace(
429 #10 + _delim[ _i ] + #10, False );
434 function cutFirst( _aWord : string; _delim : TStringList ) : string;
438 for _i := 0 to _delim.Count - 1 do begin
439 _aWord := CustomStringReplace(
442 #10 + _delim[ _i ], False );
447 function cutFinal( _aWord : string; _delim : TStringList ) : string;
451 for _i := 0 to _delim.Count - 1 do begin
452 _aWord := CustomStringReplace(
455 _delim[ _i ] + #10, False );
460 procedure addWord( _dst : TWordCount; _words : TStringList );
464 _countInfo : TWordCountInfo;
466 for _i := 0 to _words.Count - 1 do begin
467 _aWord := _words[ _i ];
468 if Length( _aWord ) > 0 then begin
469 if _dst.Find( _aWord, _idx ) then begin
470 _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
472 _countInfo := TWordCountInfo.Create;
473 _dst.AddObject( _aWord, _countInfo );
475 _countInfo.WordCount := _countInfo.WordCount + 1;
480 function changeMode( _aWord : string; _mode : Modes ) : string;
484 _pWord, _pWord2 : PChar;
485 _pWordTail, _pFound : PChar;
487 _delim : string = #10;
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490 if mode = ModeWHira then begin
495 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
497 //
\83X
\83y
\81[
\83X
\82ð
\8bl
\82ß
\82é
498 _aWord := CustomStringReplace( _aWord, ' ', '', False );
499 _aWord := CustomStringReplace( _aWord, '
\81@', '', False );
501 //
\83f
\83\8a\83~
\83^
\82Å
\92P
\8cê
\95ª
\82¯
505 _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506 Result := cutBoth( _aWord, wHiraDelimiter );
511 //
\83f
\83\8a\83~
\83^
\82Å
\92P
\8cê
\95ª
\82¯
512 _aWord := cutBoth( _aWord, wKanjiDelimiter );
513 // 4 byte (2
\8e\9a)
\82¸
\82Â
\82Å
\92P
\8cê
\95ª
\82¯
514 _pWord := PChar( _aWord );
515 _i := Length( _aWord );
516 _pWordTail := _pWord + _i;
517 SetLength( _aWord2, _i + (_i shr 2) );
518 _pWord2 := PChar( _aWord2 );
520 while _pWord < _pWordTail do begin
521 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522 if _pFound = nil then
523 _pFound := _pWordTail;
524 _pFound := _pFound - 3;
526 while _pWord <= _pFound do begin
527 CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528 _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
530 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531 CopyMemory( _pWord2, _pWord, _i );
532 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
534 if _pWord < _pWordTail then begin
535 _i := _pWordTail - _pWord;
536 CopyMemory( _pWord2, _pWord, _i );
537 _pWord2 := _pWord2 + _i;
539 SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
552 WHIRA_DELIMITER = '
\82ð' + #10 + '
\82É' + #10 + '
\82ª' + #10 + '
\82Æ' + #10 + '
\82©
\82ç'
553 + #10 + '
\82Ö' + #10 + '
\82æ
\82è' + #10 + '
\82Ü
\82Å'+ #10 + '
\82Å'
554 + #10 + '
\82±
\82±' + #10 + '
\82»
\82±' + #10 + '
\82Ç
\82±'
555 + #10 + '
\82±
\82ê' + #10 + '
\82»
\82ê' + #10 + '
\82 \82ê' + #10 + '
\82Ç
\82ê'
556 + #10 + '
\82±
\82Ì' + #10 + '
\82»
\82Ì' + #10 + '
\82 \82Ì' + #10 + '
\82Ç
\82Ì'
557 + #10 + '
\82±
\82¤' + #10 + '
\82»
\82¤' + #10 + '
\82 \82 ' + #10 + '
\82Ç
\82¤'
558 + #10 + '
\82±
\82ñ
\82È' + #10 + '
\82»
\82ñ
\82È' + #10 + '
\82 \82ñ
\82È' + #10 + '
\82Ç
\82ñ
\82È'
559 + #10 + '
\82ê
\82½' + #10 + '
\82ê
\82Ä' + #10 + '
\82ê
\82ê' + #10 + '
\82ê
\82ë'
560 + #10 + '
\82ê
\82é' + #10 + '
\82ç
\82ê
\82é'
561 + #10 + '
\82Å
\82·' + #10 + '
\82Ü
\82·' + #10 + '
\82Ü
\82¹
\82ñ'
562 + #10 + '
\82Å
\82µ
\82½' + #10 + '
\82Ü
\82µ
\82½'
563 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82³
\82ê
\82é' + #10 + '
\82³
\82ê
\82È
\82¢'
565 WKANJI_DELIMITER = '
\93I' + #10 + '
\90«' + #10 + '
\8e®' + #10 + '
\89»' + #10 + '
\96@'
566 + #10 + '
\95s' + #10 + '
\96³' + #10 + '
\94ñ' + #10 + '
\94½'
568 WHIRA_FINAL_DELIMITER = '
\82Á
\82½' + #10 + '
\82Á
\82Ä'
570 + #10 + '
\82æ
\82Á
\82Ä' + #10 + '
\82µ
\82½
\82ª
\82Á
\82Ä' + #10 + '
\82È
\82Ì
\82Å'
571 + #10 + '
\82¾
\82©
\82ç' + #10 + '
\82Å
\82·
\82©
\82ç'
573 + #10 + '
\82µ
\82©
\82µ' + #10 + '
\82¾
\82ª' + #10 + '
\82¯
\82Ç' + #10 + '
\82¯
\82ê
\82Ç'
574 + #10 + '
\82â
\82Í
\82è' + #10 + '
\82â
\82Á
\82Ï
\82è'
575 + #10 + '
\82Å
\82µ' + #10 + '
\82¾
\82ë'
576 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82µ
\82½' + #10 + '
\82µ
\82È
\82¢'
578 // '
\81['
\82ð '
\82\9f\82¡
\82£
\82¥
\82§'
\82É
\81B
579 HA_LINE = '
\82 \82©
\82³
\82½
\82È
\82Í
\82Ü
\82â
\82ç
\82í
\82ª
\82´
\82¾
\82Î
\82Ï
\82\9f\82ì';
580 HI_LINE = '
\82¢
\82«
\82µ
\82¿
\82É
\82Ð
\82Ý
\82è
\82î
\82¬
\82¶
\82Ñ
\82Ò
\82¡';
581 HU_LINE = '
\82¤
\82
\82·
\82Â
\82Ê
\82Ó
\82Þ
\82ä
\82é
\82®
\82Ô
\82Õ
\82£';
582 HE_LINE = '
\82¦
\82¯
\82¹
\82Ä
\82Ë
\82Ö
\82ß
\82ê
\82ï
\82°
\82×
\82Ø
\82¥';
583 HO_LINE = '
\82¨
\82±
\82»
\82Æ
\82Ì
\82Ù
\82à
\82æ
\82ë
\82ð
\82²
\82Ú
\82Û
\82§';
584 KA_LINE = '
\83A
\83J
\83T
\83^
\83i
\83n
\83}
\83\84\83\89\83\8f\83K
\83U
\83_
\83o
\83p
\83@
\83\95\83\8e';
585 KI_LINE = '
\83C
\83L
\83V
\83`
\83j
\83q
\83~
\83\8a\83\90\83M
\83W
\83r
\83s
\83B';
586 KU_LINE = '
\83E
\83N
\83X
\83c
\83k
\83t
\83\80\83\86\83\8b\83O
\83u
\83v
\83D
\83\94';
587 KE_LINE = '
\83G
\83P
\83Z
\83e
\83l
\83w
\83\81\83\8c\83\91\83Q
\83x
\83y
\83F
\83\96';
588 KO_LINE = '
\83I
\83R
\83\
\83g
\83m
\83z
\83\82\83\88\83\8d\83\92\83S
\83{
\83|
\83H';
589 kKanji = [$80..$A0, $E0..$ff];
592 wHiraDelimiter := TStringList.Create;
593 wHiraFinalDelimiter := TStringList.Create;
594 wKanjiDelimiter := TStringList.Create;
595 words := TStringList.Create;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599 wHiraDelimiter.Text := WHIRA_DELIMITER;
600 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
602 wKanjiDelimiter.Text := WKANJI_DELIMITER;
604 tail := p + Length( text );
607 while p < tail do begin
608 //
\95¶
\8e\9a\82Ì
\83^
\83C
\83v
\82ð
\94»
\95Ê
609 //
\81¦
\8bå
\93Ç
\93_
\82Í ModeGraph
\82É
\82È
\82é
\82Ì
\82Å
\8cÂ
\95Ê
\82É
\91Î
\89\9e\82µ
\82È
\82
\82Ä
\82à
\82¢
\82¢
610 // if Byte(Byte( p^ ) - $a1) < $5e then begin
611 if Byte( p^ ) in kKanji then begin
612 if p + 1 < tail then begin
613 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
615 //
\83X
\83y
\81[
\83X
\82Å
\92P
\8cê
\95ª
\82¯
\82¹
\82¸
\82É
\8bl
\82ß
\82é
616 //$8140: newMode := ModeWhite;
617 $8141..$824e: newMode := ModeWGraph;
618 $824f..$8258: newMode := ModeWNum;
619 $8260..$829a: newMode := ModeWAlpha;
620 $829f..$82f1: newMode := ModeWHira;
621 $8340..$8396: newMode := ModeWKata;
622 else newMode := ModeWKanji;
624 // '
\81J
\81K
\81['
\82Í
\95½
\89¼
\96¼
\81A
\82Ü
\82½
\82Í
\83J
\83^
\83J
\83i
\82É
\8aÜ
\82Ü
\82ê
\82é
625 if (mode = ModeWHira) or (mode = ModeWKata) then
626 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
629 newMode := ModeWhite;
634 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636 //
\8d¡
\82Ü
\82Å
\93ú
\96{
\8cê
\82Å
\8d¡
\83X
\83y
\81[
\83X
637 //
\92P
\8cê
\82ð
\8cq
\82°
\82Ä
\8cã
\82Å
\83X
\83y
\81[
\83X
\82ð
\8bl
\82ß
\82é
638 //
\81¦
\94¼
\8ap
\83J
\83i
\82Í
\92Ê
\8fí
\83X
\83y
\81[
\83X
\82Å
\8bæ
\90Ø
\82é
\82¾
\82ë
\82¤
\82©
\82ç
\8bl
\82ß
\82È
\82¢
645 if mode <> newMode then begin
647 //
\95¶
\8e\9a\82Ì
\83^
\83C
\83v
\82ª
\95Ï
\8dX
\82³
\82ê
\82½
648 if mode <> ModeWhite then begin
649 SetLength( aWord, p - last );
650 CopyMemory( PChar( aWord ), last, p - last );
652 words.Text := changeMode( aWord, mode );
655 addWord( wordCount, words );
666 if mode <> ModeWhite then begin
667 SetLength( aWord, p - last );
668 CopyMemory( PChar( aWord ), last, p - last );
670 words.Text := changeMode( aWord, mode );
673 addWord( wordCount, words );
677 wKanjiDelimiter.Free;
678 wHiraFinalDelimiter.Free;
684 //==============================
686 //==============================
// Scores the text described by wordCount with the Paul Graham formula.
// A per-word probability is computed by the nested function p, the
// probabilities are sorted with AbsSort, and the first
// min( SAMPLE_COUNT, count ) of them are combined as s / (s + q), where s is
// the product of the probabilities and q the product of their complements.
// NOTE(review): many lines are missing from this listing (the constants
// returned by the special cases of p, the initial values of s and q, the loop
// header around the products, the early result for an empty wordCount and
// the release of narray) - confirm them in the full source.
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
// p: probability that aWord indicates an important text.
689 function p( const aWord : string ) : Single;
693 info := Objects[ aWord ];
696 else if info.NormalWord = 0 then
698 else if info.ImportantWord = 0 then
// Words seen fewer than five times (normal occurrences counted twice) get a
// separate treatment; the value assigned is not visible here.
700 else if info.ImportantWord + info.NormalWord * 2 < 5 then
// NOTE(review): divides by ImportantText and NormalText; the guards above
// test the Word counters only - confirm the Text counters cannot be 0 here.
703 Result := ( info.ImportantWord / info.ImportantText ) /
704 ((info.NormalWord * 2 / info.NormalText ) +
705 (info.ImportantWord / info.ImportantText));
717 if wordCount.Count = 0 then
720 narray := TList.Create;
722 for i := 0 to wordCount.Count - 1 do begin
// The Single result is stored directly in the pointer slot of the TList
// (no allocation); AbsSort and the loop below cast it back.
723 narray.Add( Pointer( p( wordCount[ i ] ) ) );
726 narray.Sort( AbsSort );
730 i := min( SAMPLE_COUNT, narray.Count );
734 s := s * Single( narray[ i ] );
735 q := q * (1 - Single( narray[ i ] ));
// NOTE(review): s + q can be 0 when both products reach 0 - confirm whether
// a guard exists on the lines not shown.
738 Result := s / (s + q);
745 //==============================
747 //==============================
// Scores the text described by wordCount with the Gary Robinson formula.
// Visible steps: compute p for every word, average the values into mean,
// smooth each value with f, accumulate products, take the cnt-th root
// (cnt = number of distinct words) and combine as (1 + (P1-Q1)/(P1+Q1)) / 2.
// NOTE(review): many lines are missing from this listing (constants returned
// by the special cases of p, the value of k, the initial values of P1 / Q1 /
// mean, the Q1 product, the results for an empty wordCount and for
// P1 + Q1 = 0) - confirm them in the full source.
748 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
// p: probability that aWord indicates an important text.
750 function p( const aWord : string ) : Single;
754 info := Objects[ aWord ];
757 else if info.ImportantWord = 0 then
759 else if info.NormalWord = 0 then
// NOTE(review): divides by ImportantText and NormalText; the guards above
// test the Word counters only - confirm the Text counters cannot be 0 here.
762 Result := ( info.ImportantWord / info.ImportantText ) /
763 ((info.NormalWord / info.NormalText ) +
764 (info.ImportantWord / info.ImportantText));
// f: weighted average of the word value n (weight cnt) and mean (weight k).
767 function f( cnt : Integer; n, mean : Single ) : Extended;
771 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
776 narray : array of Single;
778 countInfo : TWordCountInfo;
780 P1, Q1, R1 : Extended;
784 if wordCount.Count = 0 then begin
789 SetLength( narray, wordCount.Count );
791 for i := 0 to wordCount.Count - 1 do begin
792 n := p( wordCount[ i ] );
796 mean := mean / wordCount.Count;
800 for i := 0 to wordCount.Count - 1 do begin
801 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
802 n := f( countInfo.WordCount, narray[ i ], mean );
803 P1 := P1 * ( 1 - n );
806 cnt := wordCount.Count;
810 P1 := 1 - Power( P1, 1 / cnt );
811 Q1 := 1 - Power( Q1, 1 / cnt );
// Guards the division below against a zero denominator.
813 if P1 + Q1 = 0 then begin
816 n := (P1 - Q1) / (P1 + Q1);
817 Result := (1 + n) / 2;
822 //==============================
823 // CalcGaryRobinsonFisher
824 //==============================
// Scores the text described by wordCount with the Gary Robinson - Fisher
// formula. Visible steps: compute p for every word, average into mean, smooth
// each value with f, accumulate the statistics P1 and Q1, pass them through
// prbx with 2 * cnt as the degree argument, and combine them into the result.
// NOTE(review): many lines are missing from this listing (constants returned
// by the special cases of p, the value of k, most of prbx, initial values,
// and the conditional-compilation or else lines that select between the
// paired statements at listing lines 919-922 and 929-933) - confirm them in
// the full source before changing anything here.
825 function TGikoBayesian.CalcGaryRobinsonFisher(
826 wordCount : TWordCount
// p: probability that aWord indicates an important text.
829 function p( const aWord : string ) : Single;
833 info := Objects[ aWord ];
836 else if info.ImportantWord = 0 then
838 else if info.NormalWord = 0 then
// NOTE(review): divides by NormalText; the guards above test the Word
// counters only - confirm NormalText cannot be 0 here.
841 Result := info.ImportantWord /
842 (info.ImportantWord + info.NormalWord *
843 info.ImportantText / info.NormalText);
// f: weighted average of the word value n (weight cnt) and mean (weight k).
846 function f( cnt : Integer; n, mean : Single ) : Extended;
850 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
// prbx: sums a series over i up to degree / 2 - 1, accumulating the terms in
// log space (term) and adding exp( term ) to sum. Presumably the chi-square
// tail probability - confirm against the missing lines.
853 function prbx( x2, degree : Extended ) : Extended;
866 while i < (degree / 2 - 1) do begin
867 term := term + ln( m / i );
868 sum := sum + exp( term );
881 narray : array of Single;
883 countInfo : TWordCountInfo;
886 important : Extended;
891 if wordCount.Count = 0 then begin
896 SetLength( narray, wordCount.Count );
898 for i := 0 to wordCount.Count - 1 do begin
899 n := p( wordCount[ i ] );
903 mean := mean / wordCount.Count;
913 for i := 0 to wordCount.Count - 1 do begin
914 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
// NOTE(review): countInfo.WordCount is read on the next line before the nil
// test that follows it; if countInfo can be nil this dereferences nil first.
915 n := f( countInfo.WordCount, narray[ i ], mean );
916 if countInfo <> nil then
917 cnt := cnt + countInfo.WordCount;
// NOTE(review): Ln( 1 - n ) and Ln( n ) are undefined for n = 1 and n = 0;
// confirm that the smoothing in f keeps n strictly between 0 and 1.
919 P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
920 Q1 := Q1 + Ln( n ) * countInfo.WordCount;
922 P1 := P1 + Ln( 1 - n );
929 P1 := prbx( -2 * P1, 2 * cnt );
930 Q1 := prbx( -2 * Q1, 2 * cnt );
932 P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
933 Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
935 if P1 + Q1 = 0 then begin
938 Result := (1 + Q1 + P1) / 2;
943 //==============================
945 //==============================
// Tokenizes text into wordCount with CountWord, then returns the score
// computed by the Calc function that matches algorithm.
// NOTE(review): the text parameter line, the case header and any fallback
// branch are not visible in this listing.
946 function TGikoBayesian.Parse(
948 wordCount : TWordCount;
949 algorithm : TGikoBayesianAlgorithm
953 CountWord( text, wordCount );
955 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
956 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
957 gbaGaryRobinsonFisher:
958 Result := CalcGaryRobinsonFisher( wordCount );
964 //==============================
966 //==============================
// Adds the words of one analyzed text to the dictionary statistics.
// For every word in wordCount the per-text occurrence count is added to the
// word counter and the text counter is incremented by one, on the important
// side when isImportant is True.
// NOTE(review): listing line 989 (presumably "end else begin") is not
// visible, so the normal-side branch is inferred from the statement order.
967 procedure TGikoBayesian.Learn(
968 wordCount : TWordCount;
969 isImportant : Boolean );
972 wordinfo : TWordInfo;
973 countinfo : TWordCountInfo;
977 for i := 0 to wordCount.Count - 1 do begin
978 aWord := wordCount[ i ];
// Name-indexed lookup in this dictionary (GetObject).
979 wordinfo := Objects[ aWord ];
980 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
// First time this word is seen: create its statistics record and store it.
981 if wordinfo = nil then begin
982 wordinfo := TWordInfo.Create;
983 Objects[ aWord ] := wordinfo;
986 if isImportant then begin
987 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
988 wordinfo.ImportantText := wordinfo.ImportantText + 1;
990 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
991 wordinfo.NormalText := wordinfo.NormalText + 1;
997 //==============================
999 //==============================
1000 procedure TGikoBayesian.Forget(
1001 wordCount : TWordCount;
1002 isImportant : Boolean );
1005 wordinfo : TWordInfo;
1006 countinfo : TWordCountInfo;
1010 for i := 0 to wordCount.Count - 1 do begin
1011 aWord := wordCount[ i ];
1012 wordinfo := Objects[ aWord ];
1013 if wordinfo = nil then
1016 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1017 if isImportant then begin
1018 if wordInfo.ImportantText > 0 then begin
1019 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1020 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1023 if wordinfo.NormalText > 0 then begin
1024 wordinfo.NormalText := wordinfo.NormalText - 1;
1025 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;