5 \brief Bayesian filter
7 $Id: GikoBayesian.pas,v 1.11 2004/10/31 16:48:44 yoffy Exp $
12 //==================================================
14 //==================================================
17 //==================================================
19 //==================================================
21 {!***********************************************************
22 \brief Word properties
23 ************************************************************}
24 TWordInfo = class( TObject )
26 FNormalWord : Integer; //!< number of times the word appeared as a normal word
27 FImportantWord : Integer; //!< number of times the word appeared as an important word
28 FNormalText : Integer; //!< number of texts that contained the word as a normal word
29 FImportantText : Integer; //!< number of texts that contained the word as an important word
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
38 {!***********************************************************
39 \brief Properties of an analyzed word
40 ************************************************************}
41 TWordCountInfo = class( TObject )
43 FWordCount : Integer; //!< word count
46 property WordCount : Integer read FWordCount write FWordCount;
49 {!***********************************************************
50 \brief List of analyzed words
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) // extremely slow
53 TWordCount = class( TStringList )
56 destructor Destroy; override;
59 {!***********************************************************
60 \brief Filter algorithm
61 ************************************************************}
62 TGikoBayesianAlgorithm =
63 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
65 {!***********************************************************
66 \brief Bayesian filter
67 ************************************************************}
68 // TGikoBayesian = class( THashedStringList ) // extremely slow
69 TGikoBayesian = class( TStringList )
71 FFilePath : string; //!< path of the file that was loaded
72 function GetObject( const name : string ) : TWordInfo;
73 procedure SetObject( const name : string; value : TWordInfo );
77 destructor Destroy; override;
79 //! Reads the learning history from a file
80 procedure LoadFromFile( const filePath : string );
82 //! Saves the learning history to a file
83 procedure SaveToFile( const filePath : string );
85 //! Saves the learning history to a file
88 //! Gets the information stored for a word
89 property Objects[ const name : string ] : TWordInfo
90 read GetObject write SetObject; default;
92 //! Counts the words contained in a text
95 wordCount : TWordCount );
98 \brief Determines the importance of a text using the Paul Graham method
99 \return importance of the text (0.0 = not worth attention ... 1.0 = deserves attention)
101 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
104 \brief Determines the importance of a text using the GaryRobinson method
105 \return importance of the text (0.0 = not worth attention ... 1.0 = deserves attention)
107 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
110 \brief Determines the importance of a text using the GaryRobinson-Fisher method
111 \return importance of the text (0.0 = not worth attention ... 1.0 = deserves attention)
113 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
116 \brief Analyzes a text
117 \param text the text to analyze
118 \param wordCount receives the list of analyzed words
119 \param algorithm selects the algorithm used to determine the importance
120 \return importance of the text (0.0 = not worth attention ... 1.0 = deserves attention)
122 This simply runs CountWord and Calcxxxxx together.
126 wordCount : TWordCount;
127 algorithm : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
131 \brief Learns a text
132 \param wordCount word list produced by Parse
133 \param isImportant True if the text should be remembered as an important text
136 wordCount : TWordCount;
137 isImportant : Boolean );
140 \brief Forgets a learning result
141 \param wordCount word list produced by Parse
142 \param isImportant True if the text had been remembered as an important text
143 \warning It is not possible to check whether a text has already been learned.<br>
144 Calling Forget for a text that was never passed to Learn, or with the wrong isImportant,
145 corrupts the database.<br>
146 Keep track of which texts have been learned yourself.
148 This does not clear all learning results.<br>
149 It only clears the learning result of the text (the text argument of Parse) that wordCount was obtained from.<br><br>
151 It is mainly used in the order Forget -> Learn to switch a text between important and non-important.
154 wordCount : TWordCount;
155 isImportant : Boolean );
158 //==================================================
160 //==================================================
163 SysUtils, Math, Windows;
166 GIKO_BAYESIAN_FILE_VERSION = '1.0';
// Character classes used while splitting a text into words: CountWord emits a
// word whenever the class of the current character differs from the previous one.
// The ModeW* members are assigned from two-byte character code ranges in
// CountWord; single-byte characters are classified through the CharMode1 table.
// NOTE(review): CharMode1 stores 2 for digits, 3 for letters and 4 for the
// $A5..$DF range, which by ordinal are ModeAlpha, ModeHanKana and ModeNum -- the
// member names look shifted relative to the table. The visible code only compares
// modes with each other and with ModeWhite/ModeWHira/ModeWKata, so this appears
// harmless -- confirm before relying on the names.
// NOTE(review): CountWord declares an identical local Modes type.
168 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
169 ModeWGraph, ModeWAlpha, ModeWNum,
170 ModeWHira, ModeWKata, ModeWKanji);
// Lookup table giving the character class of every single-byte character code
// (index = byte value, one row per 16 codes, starting at $00). CountWord reads it
// as Modes( CharMode1[ Byte( p^ ) ] ).
// Values used: 0 = white space / control, 1 = punctuation, 2 = digit, 3 = letter,
// 4 = codes $A5..$DF.
// Fix: the $30..$3F row used to hold only nine 2s, so '9' ($39) was classed as
// punctuation and split off from the digits before it ('1999' became '199' + '9').
// The row now marks all ten digits '0'..'9' as 2, matching the double-byte digit
// range $824f..$8258 handled in CountWord.
172 CharMode1 : array [ 0..255 ] of Byte =
174 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
177 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
178 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
179 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
180 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
181 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
183 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
186 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
187 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
188 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
193 //************************************************************
195 //************************************************************
197 //==============================
199 //==============================
// Cuts the leading token off s and returns it.
//   s         : in/out; on return it holds the text that follows the token and
//               one delimiter
//   delimiter : separator string, located with AnsiPos
// Returns the part of s that precedes the first delimiter.
// NOTE(review): the branch taken when the delimiter is absent (AnsiPos = 0) is
// not visible in this listing; presumably the whole string is returned -- confirm.
200 function RemoveToken(var s: string;const delimiter: string): string;
204 p := AnsiPos(delimiter, s);
208 Result := Copy(s, 1, p - 1);
// Drop the token and the delimiter that followed it from s.
209 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
212 //==============================
214 //==============================
// Comparison callback that CalcPaulGraham passes to TList.Sort.
// p1 and p2 are not addresses: each is typecast to a Single, and the two values
// are compared by their distance from 0.5.
// NOTE(review): the lines that turn v1/v2 into the Integer result are not visible
// here, so the sort direction is unverified.
// NOTE(review): the Single( Pointer ) typecast presumably requires
// SizeOf( Pointer ) = SizeOf( Single ) -- confirm the target platform is 32-bit.
215 function AbsSort( p1, p2 : Pointer ) : Integer;
220 v1 := Abs( Single( p1 ) - 0.5 );
221 v2 := Abs( Single( p2 ) - 0.5 );
231 //************************************************************
233 //************************************************************
// Creates the word-count list; duplicate keys are ignored and keys are compared
// case-sensitively.
// NOTE(review): callers use Find on this list, which needs a sorted list;
// presumably Sorted is set on a line not visible in this listing -- confirm.
234 constructor TWordCount.Create;
237 Duplicates := dupIgnore;
238 CaseSensitive := True;
// Destroys the list. Walks the entries from last to first and handles every
// entry whose attached object is not nil.
// NOTE(review): the statement executed for non-nil objects is not visible here;
// presumably it frees the attached TWordCountInfo -- confirm.
243 destructor TWordCount.Destroy;
248 for i := Count - 1 downto 0 do
249 if Objects[ i ] <> nil then
256 //************************************************************
257 // TGikoBayesian class
258 //************************************************************
260 //==============================
262 //==============================
// Creates the filter database; duplicate keys are ignored and keys are compared
// case-sensitively.
// NOTE(review): GetObject/SetObject use Find, which needs a sorted list;
// presumably Sorted is set on a line not visible in this listing -- confirm.
263 constructor TGikoBayesian.Create;
266 Duplicates := dupIgnore;
267 CaseSensitive := True;
272 //==============================
274 //==============================
// Destroys the database and frees every object attached to an entry.
// "inherited Objects" is used because this class declares its own Objects
// property that is indexed by word name instead of by position.
275 destructor TGikoBayesian.Destroy;
280 for i := Count - 1 downto 0 do
281 if inherited Objects[ i ] <> nil then
282 inherited Objects[ i ].Free;
// Reads the learning history from filePath and adds one TWordInfo per record.
//   filePath : file to read; it is also stored in FFilePath, which Save uses later
// Record layout (one per line): name, NormalWord, ImportantWord, NormalText,
// ImportantText, separated by #1, with the counters written in hexadecimal.
// NOTE(review): what happens when the file does not exist, how s is filled for
// each line, and how sl is released are on lines not visible in this listing.
// NOTE(review): the first line (written by SaveToFile as the format version) is
// skipped; no comparison against GIKO_BAYESIAN_FILE_VERSION is visible here.
288 procedure TGikoBayesian.LoadFromFile( const filePath : string );
297 FFilePath := filePath;
299 if not FileExists( filePath ) then
302 sl := TStringList.Create;
304 sl.LoadFromFile( filePath );
// Start at index 1: index 0 holds the version line.
306 for i := 1 to sl.Count - 1 do begin
308 name := RemoveToken( s, #1 );
309 info := TWordInfo.Create;
// The '$' prefix makes StrToIntDef read the field as hexadecimal; a field that
// cannot be parsed becomes 0.
310 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
311 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
312 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
313 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
// NOTE(review): with Duplicates = dupIgnore a repeated name may be dropped by
// AddObject, which would leave info unowned -- confirm.
315 AddObject( name, info );
// Writes the learning history to filePath.
//   filePath : file to write; it is also stored in FFilePath, which Save uses later
// Output: the first line is GIKO_BAYESIAN_FILE_VERSION, followed by one record per
// word: name, NormalWord, ImportantWord, NormalText, ImportantText, separated by
// #1, with the counters formatted as hexadecimal ('%x').
// NOTE(review): the line that appends s to sl and the release of sl are not
// visible in this listing.
323 procedure TGikoBayesian.SaveToFile( const filePath : string );
331 FFilePath := filePath;
333 sl := TStringList.Create;
336 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
338 for i := 0 to Count - 1 do begin
// Positional access through the inherited property; the class's own Objects
// property is indexed by name.
339 info := TWordInfo( inherited Objects[ i ] );
340 s := Strings[ i ] + #1
341 + Format('%x', [info.NormalWord]) + #1
342 + Format('%x', [info.ImportantWord]) + #1
343 + Format('%x', [info.NormalText]) + #1
344 + Format('%x', [info.ImportantText]);
349 sl.SaveToFile( filePath );
// Writes the learning history back to the path remembered in FFilePath (set by
// LoadFromFile and SaveToFile). Does nothing when no path has been recorded.
356 procedure TGikoBayesian.Save;
359 if FFilePath <> '' then
360 SaveToFile( FFilePath );
364 //==============================
366 //==============================
// Getter of the name-indexed Objects property.
//   name : the word to look up
// Returns the TWordInfo attached to the entry when Find locates the name.
// NOTE(review): the result for an unknown name is on a line not visible here;
// Learn and Forget test the result against nil, so presumably it is nil -- confirm.
367 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
372 if Find( name, idx ) then
373 Result := TWordInfo( inherited Objects[ idx ] )
379 //==============================
381 //==============================
// Setter of the name-indexed Objects property.
//   name  : the word whose information is stored
//   value : the TWordInfo to attach
// When the name already exists its attached object is replaced, otherwise a new
// entry is added.
// NOTE(review): the object that was attached before is not freed in the visible
// lines -- confirm that callers never replace an existing non-nil object.
382 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
387 if Find( name, idx ) then
388 inherited Objects[ idx ] := value
390 AddObject( name, value );
395 //==============================
397 //==============================
398 procedure TGikoBayesian.CountWord(
400 wordCount : TWordCount );
402 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
403 ModeWGraph, ModeWAlpha, ModeWNum,
404 ModeWHira, ModeWKata, ModeWKanji);
406 p, tail, last : PChar;
407 mode, newMode : Modes;
411 delimiter : TStringList;
414 countInfo : TWordCountInfo;
416 KAKUJOSI = '
\82ð' + #10 + '
\82É' + #10 + '
\82ª' + #10 + '
\82Æ' + #10 + '
\82©
\82ç'
417 + #10 + '
\82Å' + #10 + '
\82Ö' + #10 + '
\82æ
\82è' + #10 + '
\82Ü
\82Å'
419 + #10 + '
\82±
\82ê' + #10 + '
\82»
\82ê' + #10 + '
\82 \82ê' + #10 + '
\82Ç
\82ê'
420 + #10 + '
\82±
\82Ì' + #10 + '
\82»
\82Ì' + #10 + '
\82 \82Ì' + #10 + '
\82Ç
\82Ì'
421 + #10 + '
\82±
\82¤' + #10 + '
\82»
\82¤' + #10 + '
\82 \82 ' + #10 + '
\82Ç
\82¤'
422 + #10 + '
\82±
\82ñ
\82È' + #10 + '
\82»
\82ñ
\82È' + #10 + '
\82 \82ñ
\82È' + #10 + '
\82Ç
\82ñ
\82È'
423 + #10 + '
\93I' + #10 + '
\90«' + #10 + '
\8e®' + #10 + '
\89»' + #10 + '
\96@'
424 + #10 + '
\95s' + #10 + '
\96³' + #10 + '
\94ñ'
426 + #10 + '
\82µ
\82©
\82µ' + #10 + '
\82¾
\82ª' + #10 + '
\82¯
\82Ç' + #10 + '
\82¯
\82ê
\82Ç'
427 + #10 + '
\82â
\82Í
\82è' + #10 + '
\82â
\82Á
\82Ï
\82è'
428 + #10 + '
\82Å
\82·' + #10 + '
\82Ü
\82·' + #10 + '
\82Å
\82µ' + #10 + '
\82¾
\82ë'
429 + #10 + '
\82·
\82é' + #10 + '
\82µ
\82È
\82¢' + #10 + '
\82µ
\82½' + #10 + '
\82µ
\82È
\82¢'
431 kKanji = [$80..$A0, $E0..$ff];
434 delimiter := TStringList.Create;
437 delimiter.Text := KAKUJOSI;
439 tail := p + Length( text );
442 while p < tail do begin
444 // determine the character type
445 // note: punctuation marks become ModeGraph, so they need no individual handling
446 // if Byte(Byte( p^ ) - $a1) < $5e then begin
447 if Byte( p^ ) in kKanji then begin
448 if p + 1 < tail then begin
449 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
451 $8140: newMode := ModeWhite;
452 $8141..$824e: newMode := ModeWGraph;
453 $824f..$8258: newMode := ModeWNum;
454 $8260..$829a: newMode := ModeWAlpha;
455 $829f..$82f1: newMode := ModeWHira;
456 $8340..$8396: newMode := ModeWKata;
457 else newMode := ModeWKanji;
459 // the voiced / semi-voiced sound marks and the long-vowel mark ($814a, $814b, $815b) count as hiragana or katakana
460 if (mode = ModeWHira) or (mode = ModeWKata) then
461 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
464 newMode := ModeWhite;
469 // check whether a delimiter word starts at this position
470 if p + 3 < tail then begin // 3 = maximum number of characters in a delimiter - 1
471 for i := 0 to delimiter.Count - 1 do begin
473 p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin
475 chSize := Length( delimiter[ i ] );
481 newMode := Modes( CharMode1[ Byte( p^ ) ] );
486 if (mode <> newMode) or delimited then begin
488 // the character type has changed
489 if mode <> ModeWhite then begin
490 SetLength( aWord, p - last );
491 CopyMemory( PChar( aWord ), last, p - last );
492 //aWord := Copy( last, 0, p - last );
493 if wordCount.Find( aWord, idx ) then begin
494 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
496 countInfo := TWordCountInfo.Create;
497 wordCount.AddObject( aWord, countInfo );
499 countInfo.WordCount := countInfo.WordCount + 1;
504 // a delimiter word was encountered
505 if delimited then begin
506 SetLength( aWord, chSize );
507 CopyMemory( PChar( aWord ), last, chSize );
508 //aWord := Copy( last, 0, p - last );
509 if wordCount.Find( aWord, idx ) then begin
510 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
512 countInfo := TWordCountInfo.Create;
513 wordCount.AddObject( aWord, countInfo );
515 countInfo.WordCount := countInfo.WordCount + 1;
516 last := last + chSize;
526 if mode <> ModeWhite then begin
527 aWord := Copy( last, 0, p - last );
528 if wordCount.Find( aWord, idx ) then begin
529 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
531 countInfo := TWordCountInfo.Create;
532 wordCount.AddObject( aWord, countInfo );
534 countInfo.WordCount := countInfo.WordCount + 1;
542 //==============================
544 //==============================
// Scores a text with the Paul Graham method.
//   wordCount : the words of the text
// Returns s / (s + q), where s is the product of the per-word probabilities that
// were selected and q is the product of their complements.
// NOTE(review): many lines of this routine (declarations, the result for an empty
// wordCount, the selection loop, clean-up of narray) are not visible here.
545 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
// Per-word probability derived from the learned counters of aWord.
547 function p( const aWord : string ) : Single;
551 info := Objects[ aWord ];
554 else if info.NormalWord = 0 then
556 else if info.ImportantWord = 0 then
558 else if info.ImportantWord + info.NormalWord * 2 < 5 then
// Important rate divided by (doubled normal rate + important rate).
// NOTE(review): ImportantText and NormalText are used as divisors without a
// visible zero check -- confirm that they cannot be 0 on this branch.
561 Result := ( info.ImportantWord / info.ImportantText ) /
562 ((info.NormalWord * 2 / info.NormalText ) +
563 (info.ImportantWord / info.ImportantText));
575 if wordCount.Count = 0 then
578 narray := TList.Create;
580 for i := 0 to wordCount.Count - 1 do begin
// The Single result is stored in the TList by typecasting it to a Pointer.
581 narray.Add( Pointer( p( wordCount[ i ] ) ) );
// AbsSort compares the stored values by their distance from 0.5.
584 narray.Sort( AbsSort );
// At most SAMPLE_COUNT entries are used.
588 i := min( SAMPLE_COUNT, narray.Count );
592 s := s * Single( narray[ i ] );
593 q := q * (1 - Single( narray[ i ] ));
596 Result := s / (s + q);
603 //==============================
605 //==============================
// Scores a text with the Gary Robinson method.
//   wordCount : the words of the text, each with a TWordCountInfo attached
// Returns (1 + n) / 2 with n = (P1 - Q1) / (P1 + Q1); the case P1 + Q1 = 0 is
// handled separately.
// NOTE(review): declarations, initial values, the Q1 accumulation and the results
// of the special cases are on lines not visible in this listing.
606 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
// Per-word probability derived from the learned counters of aWord.
608 function p( const aWord : string ) : Single;
612 info := Objects[ aWord ];
615 else if info.ImportantWord = 0 then
617 else if info.NormalWord = 0 then
// Important rate divided by (normal rate + important rate).
// NOTE(review): ImportantText and NormalText are used as divisors without a
// visible zero check -- confirm.
620 Result := ( info.ImportantWord / info.ImportantText ) /
621 ((info.NormalWord / info.NormalText ) +
622 (info.ImportantWord / info.ImportantText));
// Weighted average of mean (weight k) and n (weight cnt).
625 function f( cnt : Integer; n, mean : Single ) : Extended;
629 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
634 narray : array of Single;
636 countInfo : TWordCountInfo;
638 P1, Q1, R1 : Extended;
642 if wordCount.Count = 0 then begin
647 SetLength( narray, wordCount.Count );
// First pass: per-word probabilities (the accumulation into narray and mean is
// not visible here).
649 for i := 0 to wordCount.Count - 1 do begin
650 n := p( wordCount[ i ] );
654 mean := mean / wordCount.Count;
// Second pass: adjust each probability with f and fold it into the products.
658 for i := 0 to wordCount.Count - 1 do begin
659 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
660 n := f( countInfo.WordCount, narray[ i ], mean );
661 P1 := P1 * ( 1 - n );
664 cnt := wordCount.Count;
// 1 minus the cnt-th root of each accumulated product.
668 P1 := 1 - Power( P1, 1 / cnt );
669 Q1 := 1 - Power( Q1, 1 / cnt );
671 if P1 + Q1 = 0 then begin
674 n := (P1 - Q1) / (P1 + Q1);
675 Result := (1 + n) / 2;
680 //==============================
681 // CalcGaryRobinsonFisher
682 //==============================
// Scores a text with the Gary Robinson-Fisher method.
//   wordCount : the words of the text, each with a TWordCountInfo attached
// Returns (1 + Q1 + P1) / 2 computed from the two prbx results; the case
// P1 + Q1 = 0 is handled separately.
// NOTE(review): declarations, initial values, most of prbx and what appear to be
// conditional-compilation directives are on lines not visible in this listing;
// several visible statements below look like alternative variants of each other.
683 function TGikoBayesian.CalcGaryRobinsonFisher(
684 wordCount : TWordCount
// Per-word probability derived from the learned counters of aWord.
687 function p( const aWord : string ) : Single;
691 info := Objects[ aWord ];
694 else if info.ImportantWord = 0 then
696 else if info.NormalWord = 0 then
// NOTE(review): NormalText is used as a divisor without a visible zero check.
699 Result := info.ImportantWord /
700 (info.ImportantWord + info.NormalWord *
701 info.ImportantText / info.NormalText);
// Weighted average of mean (weight k) and n (weight cnt).
704 function f( cnt : Integer; n, mean : Single ) : Extended;
708 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
// Sums exp( term ) over a series whose length depends on degree / 2.
// NOTE(review): presumably the chi-square probability used by Fisher's method;
// initialisation and the returned expression are not visible -- confirm.
711 function prbx( x2, degree : Extended ) : Extended;
724 while i < (degree / 2 - 1) do begin
725 term := term + ln( m / i );
726 sum := sum + exp( term );
739 narray : array of Single;
741 countInfo : TWordCountInfo;
744 important : Extended;
749 if wordCount.Count = 0 then begin
754 SetLength( narray, wordCount.Count );
// First pass: per-word probabilities (the accumulation into narray and mean is
// not visible here).
756 for i := 0 to wordCount.Count - 1 do begin
757 n := p( wordCount[ i ] );
761 mean := mean / wordCount.Count;
// Second pass: adjust each probability with f and accumulate the log terms.
771 for i := 0 to wordCount.Count - 1 do begin
772 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
// NOTE(review): countInfo.WordCount is read here, before the nil test two lines
// below; if a nil object were possible this would already have failed. Either the
// test is redundant or it must move above this line.
773 n := f( countInfo.WordCount, narray[ i ], mean );
774 if countInfo <> nil then
775 cnt := cnt + countInfo.WordCount;
777 P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
778 Q1 := Q1 + Ln( n ) * countInfo.WordCount;
780 P1 := P1 + Ln( 1 - n );
787 P1 := prbx( -2 * P1, 2 * cnt );
788 Q1 := prbx( -2 * Q1, 2 * cnt );
790 P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
791 Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
793 if P1 + Q1 = 0 then begin
// NOTE(review): Robinson's published indicator has the form (1 + H - S) / 2; the
// sum used here may be intentional depending on what prbx returns -- confirm.
796 Result := (1 + Q1 + P1) / 2;
801 //==============================
803 //==============================
// Analyzes a text: fills wordCount through CountWord and then scores it with the
// Calc* routine selected by algorithm.
//   wordCount : receives the words found in the text
//   algorithm : gbaPaulGraham, gbaGaryRobinson or gbaGaryRobinsonFisher
// Returns the value computed by the selected Calc* routine.
// NOTE(review): the text parameter line and any else branch of the case statement
// are not visible in this listing.
804 function TGikoBayesian.Parse(
806 wordCount : TWordCount;
807 algorithm : TGikoBayesianAlgorithm
811 CountWord( text, wordCount );
813 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
814 gbaGaryRobinson: Result := CalcGaryRobinson( wordCount );
815 gbaGaryRobinsonFisher:
816 Result := CalcGaryRobinsonFisher( wordCount );
822 //==============================
824 //==============================
// Adds the words of one text to the learned statistics.
//   wordCount   : the words of the text, each with a TWordCountInfo attached
//   isImportant : True adds to the Important* counters, False to the Normal* ones
// For every word the matching *Word counter grows by the number of occurrences in
// the text and the matching *Text counter grows by one.
825 procedure TGikoBayesian.Learn(
826 wordCount : TWordCount;
827 isImportant : Boolean );
830 wordinfo : TWordInfo;
831 countinfo : TWordCountInfo;
835 for i := 0 to wordCount.Count - 1 do begin
836 aWord := wordCount[ i ];
837 wordinfo := Objects[ aWord ];
838 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
// First time this word is seen: create its record and register it under the word.
839 if wordinfo = nil then begin
840 wordinfo := TWordInfo.Create;
841 Objects[ aWord ] := wordinfo;
844 if isImportant then begin
845 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
846 wordinfo.ImportantText := wordinfo.ImportantText + 1;
848 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
849 wordinfo.NormalText := wordinfo.NormalText + 1;
855 //==============================
857 //==============================
858 procedure TGikoBayesian.Forget(
859 wordCount : TWordCount;
860 isImportant : Boolean );
863 wordinfo : TWordInfo;
864 countinfo : TWordCountInfo;
868 for i := 0 to wordCount.Count - 1 do begin
869 aWord := wordCount[ i ];
870 wordinfo := Objects[ aWord ];
871 if wordinfo = nil then
874 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
875 if isImportant then begin
876 if wordInfo.ImportantText > 0 then begin
877 wordinfo.ImportantText := wordinfo.ImportantText - 1;
878 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
881 if wordinfo.NormalText > 0 then begin
882 wordinfo.NormalText := wordinfo.NormalText - 1;
883 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;