5 \brief
\83x
\83C
\83W
\83A
\83\93\83t
\83B
\83\8b\83^
7 $Id: GikoBayesian.pas,v 1.8 2004/10/21 05:59:39 yoffy Exp $
12 //==================================================
14 //==================================================
17 //==================================================
19 //==================================================
21 {!***********************************************************
22 \brief
\92P
\8cê
\83v
\83\8d\83p
\83e
\83B
23 ************************************************************}
24 TWordInfo = class( TObject )
26 FNormalWord : Integer; //!<
\92Ê
\8fí
\82Ì
\92P
\8cê
\82Æ
\82µ
\82Ä
\93o
\8fê
\82µ
\82½
\89ñ
\90\94
27 FImportantWord : Integer; //!<
\92\8d\96Ú
\92P
\8cê
\82Æ
\82µ
\82Ä
\93o
\8fê
\82µ
\82½
\89ñ
\90\94
28 FNormalText : Integer; //!<
\92Ê
\8fí
\82Ì
\92P
\8cê
\82Æ
\82µ
\82Ä
\8aÜ
\82Ü
\82ê
\82Ä
\82¢
\82½
\95¶
\8fÍ
\82Ì
\90\94
29 FImportantText : Integer; //!<
\92\8d\96Ú
\92P
\8cê
\82Æ
\82µ
\82Ä
\8aÜ
\82Ü
\82ê
\82Ä
\82¢
\82½
\95¶
\8fÍ
\82Ì
\90\94
32 property NormalWord : Integer read FNormalWord write FNormalWord;
33 property ImportantWord : Integer read FImportantWord write FImportantWord;
34 property NormalText : Integer read FNormalText write FNormalText;
35 property ImportantText : Integer read FImportantText write FImportantText;
38 {!***********************************************************
39 \brief
\89ð
\90Í
\8dÏ
\82Ý
\92P
\8cê
\83v
\83\8d\83p
\83e
\83B
40 ************************************************************}
41 TWordCountInfo = class( TObject )
43 FWordCount : Integer; //!<
\92P
\8cê
\90\94
46 property WordCount : Integer read FWordCount write FWordCount;
49 {!***********************************************************
50 \brief
\89ð
\90Í
\8dÏ
\82Ý
\92P
\8cê
\83\8a\83X
\83g
51 ************************************************************}
52 // TWordCount = class( THashedStringList ) //
\8c\83\92x
53 TWordCount = class( TStringList )
56 destructor Destroy; override;
59 {!***********************************************************
60 \brief
\83t
\83B
\83\8b\83^
\83A
\83\8b\83S
\83\8a\83Y
\83\80
61 ************************************************************}
62 TGikoBayesianAlgorithm =
63 (gbaPaulGraham, gbaGaryRonbinson{, gbaGaryRonbinsonFisher});
65 {!***********************************************************
66 \brief
\83x
\83C
\83W
\83A
\83\93\83t
\83B
\83\8b\83^
67 ************************************************************}
68 // TGikoBayesian = class( THashedStringList ) //
\8c\83\92x
69 TGikoBayesian = class( TStringList )
71 FFilePath : string; //!<
\93Ç
\82Ý
\8d\9e\82ñ
\82¾
\83t
\83@
\83C
\83\8b\83p
\83X
72 function GetObject( const name : string ) : TWordInfo;
73 procedure SetObject( const name : string; value : TWordInfo );
77 destructor Destroy; override;
79 //!
\83t
\83@
\83C
\83\8b\82©
\82ç
\8aw
\8fK
\97\9a\97ð
\82ð
\93Ç
\82Ý
\8fo
\82µ
\82Ü
\82·
80 procedure LoadFromFile( const filePath : string );
82 //!
\83t
\83@
\83C
\83\8b\82É
\8aw
\8fK
\97\9a\97ð
\82ð
\95Û
\91¶
\82µ
\82Ü
\82·
83 procedure SaveToFile( const filePath : string );
85 //!
\83t
\83@
\83C
\83\8b\82É
\8aw
\8fK
\97\9a\97ð
\82ð
\95Û
\91¶
\82µ
\82Ü
\82·
88 //!
\92P
\8cê
\82É
\91Î
\82·
\82é
\8fî
\95ñ
\82ð
\8eæ
\93¾
\82µ
\82Ü
\82·
89 property Objects[ const name : string ] : TWordInfo
90 read GetObject write SetObject; default;
92 //!
\95¶
\8fÍ
\82É
\8aÜ
\82Ü
\82ê
\82é
\92P
\8cê
\82ð
\83J
\83E
\83\93\83g
\82µ
\82Ü
\82·
95 wordCount : TWordCount );
98 \brief Paul Graham
\96@
\82É
\8aî
\82Ã
\82¢
\82Ä
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x
\82ð
\8c\88\92è
\82µ
\82Ü
\82·
99 \return
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x (
\92\8d\96Ú
\82É
\92l
\82µ
\82È
\82¢ 0.0
\81`1.0
\92\8d\96Ú
\82·
\82×
\82«)
101 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
104 \brief GaryRobinson
\96@
\82É
\8aî
\82Ã
\82¢
\82Ä
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x
\82ð
\8c\88\92è
\82µ
\82Ü
\82·
105 \return
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x (
\92\8d\96Ú
\82É
\92l
\82µ
\82È
\82¢ 0.0
\81`1.0
\92\8d\96Ú
\82·
\82×
\82«)
107 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
109 // function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
112 \brief
\95¶
\8fÍ
\82ð
\89ð
\90Í
113 \param text
\89ð
\90Í
\82·
\82é
\95¶
\8fÍ
114 \param wordCount
\89ð
\90Í
\82³
\82ê
\82½
\92P
\8cê
\83\8a\83X
\83g
\82ª
\95Ô
\82é
115 \param algorithm
\92\8d\96Ú
\93x
\82Ì
\8c\88\92è
\82É
\97p
\82¢
\82é
\83A
\83\8b\83S
\83\8a\83Y
\83\80\82ð
\8ew
\92è
\82µ
\82Ü
\82·
116 \return
\95¶
\8fÍ
\82Ì
\92\8d\96Ú
\93x (
\92\8d\96Ú
\82É
\92l
\82µ
\82È
\82¢ 0.0
\81`1.0
\92\8d\96Ú
\82·
\82×
\82«)
118 CountWord
\82Æ Calcxxxxx
\82ð
\82Ü
\82Æ
\82ß
\82Ä
\8eÀ
\8ds
\82·
\82é
\82¾
\82¯
\82Å
\82·
\81B
122 wordCount : TWordCount;
123 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
127 \brief
\8aw
\8fK
\82·
\82é
128 \param wordCount Parse
\82Å
\89ð
\90Í
\82³
\82ê
\82½
\92P
\8cê
\83\8a\83X
\83g
129 \param isImportant
\92\8d\96Ú
\82·
\82×
\82«
\95¶
\8fÍ
\82Æ
\82µ
\82Ä
\8ao
\82¦
\82é
\82È
\82ç True
132 wordCount : TWordCount;
133 isImportant : Boolean );
136 \brief
\8aw
\8fK
\8c\8b\89Ê
\82ð
\96Y
\82ê
\82é
137 \param wordCount Parse
\82Å
\89ð
\90Í
\82³
\82ê
\82½
\92P
\8cê
\83\8a\83X
\83g
138 \param isImportant
\92\8d\96Ú
\82·
\82×
\82«
\95¶
\8fÍ
\82Æ
\82µ
\82Ä
\8ao
\82¦
\82ç
\82ê
\82Ä
\82¢
\82½
\82È
\82ç True
139 \warning
\8aw
\8fK
\8dÏ
\82Ý
\82Ì
\95¶
\8fÍ
\82©
\82Ç
\82¤
\82©
\82Í
\8am
\94F
\8fo
\97\88\82Ü
\82¹
\82ñ
\81B<br>
140 Learn
\82µ
\82Ä
\82¢
\82È
\82¢
\95¶
\8fÍ
\82â isImportant
\82ª
\8aÔ
\88á
\82Á
\82Ä
\82¢
\82é
\95¶
\8fÍ
\82ð
141 Forget
\82·
\82é
\82Æ
\83f
\81[
\83^
\83x
\81[
\83X
\82ª
\94j
\91¹
\82µ
\82Ü
\82·
\81B<br>
142 \8aw
\8fK
\8dÏ
\82Ý
\82©
\82Ç
\82¤
\82©
\82Í
\93Æ
\8e©
\82É
\8aÇ
\97\9d\82µ
\82Ä
\82
\82¾
\82³
\82¢
\81B
144 \91S
\82Ä
\82Ì
\8aw
\8fK
\8c\8b\89Ê
\82ð
\83N
\83\8a\83A
\82·
\82é
\82í
\82¯
\82Å
\82Í
\82 \82è
\82Ü
\82¹
\82ñ
\81B<br>
145 wordCount
\82ð
\93¾
\82½
\95¶
\8fÍ (Parse
\82Ì text
\88ø
\90\94)
\82Ì
\8aw
\8fK
\8c\8b\89Ê
\82Ì
\82Ý
\83N
\83\8a\83A
\82µ
\82Ü
\82·
\81B<br><br>
147 \8eå
\82É
\92\8d\96Ú
\95¶
\8fÍ
\82Æ
\94ñ
\92\8d\96Ú
\95¶
\8fÍ
\82ð
\90Ø
\82è
\91Ö
\82¦
\82é
\82½
\82ß
\82É Forget -> Learn
\82Ì
\8f\87\82Å
\8eg
\97p
\82µ
\82Ü
\82·
\81B
150 wordCount : TWordCount;
151 isImportant : Boolean );
154 //==================================================
156 //==================================================
159 SysUtils, Math, Windows;
162 GIKO_BAYESIAN_FILE_VERSION = '1.0';
164 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
165 ModeWGraph, ModeWAlpha, ModeWNum,
166 ModeWHira, ModeWKata, ModeWKanji);
168 CharMode1 : array [ 0..255 ] of Byte =
170 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
173 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
174 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
175 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
176 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
177 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
182 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
183 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
184 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
189 //************************************************************
191 //************************************************************
193 //==============================
195 //==============================
196 function RemoveToken(var s: string;const delimiter: string): string;
200 p := AnsiPos(delimiter, s);
204 Result := Copy(s, 1, p - 1);
205 s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
208 //==============================
210 //==============================
211 function AbsSort( p1, p2 : Pointer ) : Integer;
216 v1 := Abs( Single( p1 ) - 0.5 );
217 v2 := Abs( Single( p2 ) - 0.5 );
227 //************************************************************
229 //************************************************************
230 constructor TWordCount.Create;
233 Duplicates := dupIgnore;
234 CaseSensitive := True;
239 destructor TWordCount.Destroy;
244 for i := Count - 1 downto 0 do
245 if Objects[ i ] <> nil then
252 //************************************************************
253 // TGikoBayesian class
254 //************************************************************
256 //==============================
258 //==============================
259 constructor TGikoBayesian.Create;
262 Duplicates := dupIgnore;
263 CaseSensitive := True;
268 //==============================
270 //==============================
271 destructor TGikoBayesian.Destroy;
276 for i := Count - 1 downto 0 do
277 if inherited Objects[ i ] <> nil then
278 inherited Objects[ i ].Free;
284 procedure TGikoBayesian.LoadFromFile( const filePath : string );
293 FFilePath := filePath;
295 if not FileExists( filePath ) then
298 sl := TStringList.Create;
300 sl.LoadFromFile( filePath );
302 for i := 1 to sl.Count - 1 do begin
304 name := RemoveToken( s, #1 );
305 info := TWordInfo.Create;
306 info.NormalWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
307 info.ImportantWord := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
308 info.NormalText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
309 info.ImportantText := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
311 AddObject( name, info );
319 procedure TGikoBayesian.SaveToFile( const filePath : string );
327 FFilePath := filePath;
329 sl := TStringList.Create;
332 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
334 for i := 0 to Count - 1 do begin
335 info := TWordInfo( inherited Objects[ i ] );
336 s := Strings[ i ] + #1
337 + Format('%x', [info.NormalWord]) + #1
338 + Format('%x', [info.ImportantWord]) + #1
339 + Format('%x', [info.NormalText]) + #1
340 + Format('%x', [info.ImportantText]);
345 sl.SaveToFile( filePath );
352 procedure TGikoBayesian.Save;
355 if FFilePath <> '' then
356 SaveToFile( FFilePath );
360 //==============================
362 //==============================
363 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
368 idx := IndexOf( name ); //
\8c\83\92x
372 Result := TWordInfo( inherited Objects[ idx ] );
376 //==============================
378 //==============================
379 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
384 idx := IndexOf( name );
386 AddObject( name, value )
388 inherited Objects[ idx ] := value;
393 //==============================
395 //==============================
396 procedure TGikoBayesian.CountWord(
398 wordCount : TWordCount );
400 Modes = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
401 ModeWGraph, ModeWAlpha, ModeWNum,
402 ModeWHira, ModeWKata, ModeWKanji);
404 p, tail, last : PChar;
405 mode, newMode : Modes;
409 delimiter : TStringList;
412 countInfo : TWordCountInfo;
414 KAKUJOSI = '
\82ð' + #10 + '
\82É' + #10 + '
\82ª' + #10 + '
\82Æ' + #10 + '
\82©
\82ç' +
415 #10 + '
\82Å' + #10 + '
\82Ö' + #10 + '
\82æ
\82è' + #10 + '
\82Ü
\82Å';
416 kKanji = [$80..$A0, $E0..$ff];
419 delimiter := TStringList.Create;
422 delimiter.Text := KAKUJOSI;
424 tail := p + Length( text );
427 while p < tail do begin
429 //
\95¶
\8e\9a\82Ì
\83^
\83C
\83v
\82ð
\94»
\95Ê
430 //
\81¦
\8bå
\93Ç
\93_
\82Í ModeGraph
\82É
\82È
\82é
\82Ì
\82Å
\8cÂ
\95Ê
\82É
\91Î
\89\9e\82µ
\82È
\82
\82Ä
\82à
\82¢
\82¢
431 // if Byte(Byte( p^ ) - $a1) < $5e then begin
432 if Byte( p^ ) in kKanji then begin
433 if p + 1 < tail then begin
434 ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
436 $8140: newMode := ModeWhite;
437 $8141..$824e: newMode := ModeWGraph;
438 $824f..$8258: newMode := ModeWNum;
439 $8260..$829a: newMode := ModeWAlpha;
440 $829f..$82f1: newMode := ModeWHira;
441 $8340..$8396: newMode := ModeWKata;
442 else newMode := ModeWKanji;
444 // '
\81J
\81K
\81['
\82Í
\95½
\89¼
\96¼
\81A
\82Ü
\82½
\82Í
\83J
\83^
\83J
\83i
\82É
\8aÜ
\82Ü
\82ê
\82é
445 if (mode = ModeWHira) or (mode = ModeWKata) then
446 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
449 newMode := ModeWhite;
454 //
\8bæ
\90Ø
\82è
\82É
\82È
\82é
\95¶
\8e\9a\82ª
\82 \82é
\82©
\8c\9f\8d¸
\82·
\82é
455 if p + 3 < tail then begin // 3 = delimiter
\82Ì
\8dÅ
\91å
\8e\9a\90\94 - 1
456 for i := 0 to delimiter.Count - 1 do begin
458 p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin
460 chSize := Length( delimiter[ i ] );
466 newMode := Modes( CharMode1[ Byte( p^ ) ] );
471 if (mode <> newMode) or delimited then begin
473 //
\95¶
\8e\9a\82Ì
\83^
\83C
\83v
\82ª
\95Ï
\8dX
\82³
\82ê
\82½
474 //
\82à
\82µ
\82
\82Í
\8bæ
\90Ø
\82è
\82É
\82È
\82é
\95¶
\8e\9a\82É
\91\98\8bö
\82µ
\82½
475 if mode <> ModeWhite then begin
476 SetLength( aWord, p - last );
477 CopyMemory( PChar( aWord ), last, p - last );
478 //aWord := Copy( last, 0, p - last );
479 idx := wordCount.IndexOf( aWord ); //
\92x
480 if idx < 0 then begin
481 countInfo := TWordCountInfo.Create;
482 wordCount.AddObject( aWord, countInfo );
484 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
486 countInfo.WordCount := countInfo.WordCount + 1;
497 if mode <> ModeWhite then begin
498 aWord := Copy( last, 0, p - last );
499 idx := wordCount.IndexOf( aWord );
500 if idx < 0 then begin
501 countInfo := TWordCountInfo.Create;
502 wordCount.AddObject( aWord, countInfo );
504 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
506 countInfo.WordCount := countInfo.WordCount + 1;
514 //==============================
516 //==============================
517 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
519 function p( const aWord : string ) : Single;
523 info := Objects[ aWord ];
526 else if info.NormalWord = 0 then
528 else if info.ImportantWord = 0 then
531 Result := ( info.ImportantWord / info.ImportantText ) /
532 ((info.NormalWord * 2 / info.NormalText ) +
533 (info.ImportantWord / info.ImportantText));
545 if wordCount.Count = 0 then
548 narray := TList.Create;
550 for i := 0 to wordCount.Count - 1 do begin
551 narray.Add( Pointer( p( wordCount[ i ] ) ) );
554 narray.Sort( AbsSort );
558 i := min( SAMPLE_COUNT, narray.Count );
561 s := s * Single( narray[ i ] );
562 q := q * (1 - Single( narray[ i ] ));
565 Result := s / (s + q);
572 //==============================
574 //==============================
575 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
577 function p( const aWord : string ) : Single;
581 info := Objects[ aWord ];
584 else if info.ImportantWord = 0 then
586 else if info.NormalWord = 0 then
589 Result := ( info.ImportantWord / info.ImportantText ) /
590 ((info.NormalWord / info.NormalText ) +
591 (info.ImportantWord / info.ImportantText));
594 function f( cnt : Integer; n, mean : Single ) : Extended;
598 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
603 narray : array of Single;
605 countInfo : TWordCountInfo;
608 important : Extended;
612 if wordCount.Count = 0 then begin
617 SetLength( narray, wordCount.Count );
619 for i := 0 to wordCount.Count - 1 do begin
620 n := p( wordCount[ i ] );
624 mean := mean / wordCount.Count;
629 for i := 0 to wordCount.Count - 1 do begin
630 countInfo := TWordCountInfo( wordCount.Objects[ i ] );
631 n := f( countInfo.WordCount, narray[ i ], mean );
632 normal := normal * n;
633 important := important * (1 - n);
634 if countInfo <> nil then
635 cnt := cnt + countInfo.WordCount;
639 normal := 1 - Exp( Ln( normal ) * (1 / cnt) );
640 important := 1 - Exp( Ln( important ) * (1 / cnt) );
642 n := (important - normal+ 0.00001) / (important + normal + 0.00001);
643 Result := (1 + n) / 2;
647 //==============================
649 //==============================
650 function TGikoBayesian.Parse(
652 wordCount : TWordCount;
653 algorithm : TGikoBayesianAlgorithm = gbaGaryRonbinson
657 CountWord( text, wordCount );
659 gbaPaulGraham: Result := CalcPaulGraham( wordCount );
660 gbaGaryRonbinson: Result := CalcGaryRobinson( wordCount );
666 //==============================
668 //==============================
669 procedure TGikoBayesian.Learn(
670 wordCount : TWordCount;
671 isImportant : Boolean );
674 wordinfo : TWordInfo;
675 countinfo : TWordCountInfo;
679 for i := 0 to wordCount.Count - 1 do begin
680 aWord := wordCount[ i ];
681 wordinfo := Objects[ aWord ];
682 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
683 if wordinfo = nil then begin
684 wordinfo := TWordInfo.Create;
685 Objects[ aWord ] := wordinfo;
688 if isImportant then begin
689 wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
690 wordinfo.ImportantText := wordinfo.ImportantText + 1;
692 wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
693 wordinfo.NormalText := wordinfo.NormalText + 1;
699 //==============================
701 //==============================
702 procedure TGikoBayesian.Forget(
703 wordCount : TWordCount;
704 isImportant : Boolean );
707 wordinfo : TWordInfo;
708 countinfo : TWordCountInfo;
712 for i := 0 to wordCount.Count - 1 do begin
713 aWord := wordCount[ i ];
714 wordinfo := Objects[ aWord ];
715 if wordinfo = nil then
718 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
719 if isImportant then begin
720 if wordInfo.ImportantText > 0 then begin
721 wordinfo.ImportantText := wordinfo.ImportantText - 1;
722 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
725 if wordinfo.NormalText > 0 then begin
726 wordinfo.NormalText := wordinfo.NormalText - 1;
727 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;