OSDN Git Service

辞書が壊れていてもいいように、値がマイナス値を取れないよう修正。
[gikonavigoeson/gikonavi.git] / GikoBayesian.pas
1 unit GikoBayesian;
2
3 {!
4 \file           GikoBayesian.pas
5 \brief  \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
6
7 $Id: GikoBayesian.pas,v 1.8 2004/10/21 05:59:39 yoffy Exp $
8 }
9
10 interface
11
12 //==================================================
13 uses
14 //==================================================
15         Classes;
16
17 //==================================================
18 type
19 //==================================================
20
21         {!***********************************************************
22         \brief \92P\8cê\83v\83\8d\83p\83e\83B
23         ************************************************************}
24         TWordInfo       = class( TObject )
25         private
26                 FNormalWord                     :       Integer;        //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
27                 FImportantWord  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
28                 FNormalText                     : Integer;      //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
29                 FImportantText  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
30
31         public
32                 property NormalWord                     : Integer       read FNormalWord write FNormalWord;
33                 property ImportantWord  : Integer       read FImportantWord write FImportantWord;
34                 property NormalText                     : Integer       read FNormalText write FNormalText;
35                 property ImportantText  : Integer       read FImportantText write FImportantText;
36         end;
37
38         {!***********************************************************
39         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83v\83\8d\83p\83e\83B
40         ************************************************************}
41         TWordCountInfo  = class( TObject )
42         private
43                 FWordCount      :       Integer;        //!< \92P\8cê\90\94
44
45         public
46                 property WordCount      : Integer       read FWordCount write FWordCount;
47         end;
48
49         {!***********************************************************
50         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83\8a\83X\83g
51         ************************************************************}
52 //      TWordCount      = class( THashedStringList )    // \8c\83\92x
53         TWordCount      = class( TStringList )
54         public
55                 constructor Create;
56                 destructor Destroy; override;
57         end;
58
59         {!***********************************************************
60         \brief \83t\83B\83\8b\83^\83A\83\8b\83S\83\8a\83Y\83\80
61         ************************************************************}
62         TGikoBayesianAlgorithm =
63                 (gbaPaulGraham, gbaGaryRonbinson{, gbaGaryRonbinsonFisher});
64
65         {!***********************************************************
66         \brief \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
67         ************************************************************}
68 //      TGikoBayesian = class( THashedStringList )      // \8c\83\92x
69         TGikoBayesian = class( TStringList )
70         private
71                 FFilePath       : string;       //!< \93Ç\82Ý\8d\9e\82ñ\82¾\83t\83@\83C\83\8b\83p\83X
72                 function GetObject( const name : string ) : TWordInfo;
73                 procedure SetObject( const name : string; value : TWordInfo );
74
75         public
76                 constructor Create;
77                 destructor Destroy; override;
78
79                 //! \83t\83@\83C\83\8b\82©\82ç\8aw\8fK\97\9a\97ð\82ð\93Ç\82Ý\8fo\82µ\82Ü\82·
80                 procedure LoadFromFile( const filePath : string );
81
82                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
83                 procedure SaveToFile( const filePath : string );
84
85                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
86                 procedure Save;
87
88                 //! \92P\8cê\82É\91Î\82·\82é\8fî\95ñ\82ð\8eæ\93¾\82µ\82Ü\82·
89                 property Objects[ const name : string ] : TWordInfo
90                         read GetObject write SetObject; default;
91
92                 //! \95\8fÍ\82É\8aÜ\82Ü\82ê\82é\92P\8cê\82ð\83J\83E\83\93\83g\82µ\82Ü\82·
93                 procedure CountWord(
94                         const text      : string;
95                         wordCount               : TWordCount );
96
97                 {!
98                 \brief  Paul Graham \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
99                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
100                 }
101                 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
102
103                 {!
104                 \brief  GaryRobinson \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
105                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
106                 }
107                 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
108
109 //              function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
110
111                 {!
112                 \brief  \95\8fÍ\82ð\89ð\90Í
113                 \param  text                                    \89ð\90Í\82·\82é\95\8fÍ
114                 \param  wordCount                       \89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g\82ª\95Ô\82é
115                 \param  algorithm                       \92\8d\96Ú\93x\82Ì\8c\88\92è\82É\97p\82¢\82é\83A\83\8b\83S\83\8a\83Y\83\80\82ð\8ew\92è\82µ\82Ü\82·
116                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
117
118                 CountWord \82Æ Calcxxxxx \82ð\82Ü\82Æ\82ß\82Ä\8eÀ\8ds\82·\82é\82¾\82¯\82Å\82·\81B
119                 }
120                 function Parse(
121                         const text                              : string;
122                         wordCount                                       : TWordCount;
123                         algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRonbinson
124                 ) : Extended;
125
126                 {!
127                 \brief  \8aw\8fK\82·\82é
128                 \param  wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
129                 \param  isImportant \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82é\82È\82ç True
130                 }
131                 procedure Learn(
132                         wordCount                : TWordCount;
133                         isImportant      : Boolean );
134
135                 {!
136                 \brief          \8aw\8fK\8c\8b\89Ê\82ð\96Y\82ê\82é
137                 \param          wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
138                 \param          isImportant     \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82ç\82ê\82Ä\82¢\82½\82È\82ç True
139                 \warning        \8aw\8fK\8dÏ\82Ý\82Ì\95\8fÍ\82©\82Ç\82¤\82©\82Í\8am\94F\8fo\97\88\82Ü\82¹\82ñ\81B<br>
140                                                         Learn \82µ\82Ä\82¢\82È\82¢\95\8fÍ\82â isImportant \82ª\8aÔ\88á\82Á\82Ä\82¢\82é\95\8fÍ\82ð
141                                                         Forget \82·\82é\82Æ\83f\81[\83^\83x\81[\83X\82ª\94j\91¹\82µ\82Ü\82·\81B<br>
142                                                         \8aw\8fK\8dÏ\82Ý\82©\82Ç\82¤\82©\82Í\93Æ\8e©\82É\8aÇ\97\9d\82µ\82Ä\82­\82¾\82³\82¢\81B
143
144                 \91S\82Ä\82Ì\8aw\8fK\8c\8b\89Ê\82ð\83N\83\8a\83A\82·\82é\82í\82¯\82Å\82Í\82 \82è\82Ü\82¹\82ñ\81B<br>
145                 wordCount \82ð\93¾\82½\95\8fÍ (Parse \82Ì text \88ø\90\94\82Ì\8aw\8fK\8c\8b\89Ê\82Ì\82Ý\83N\83\8a\83A\82µ\82Ü\82·\81B<br><br>
146
147                 \8eå\82É\92\8d\96Ú\95\8fÍ\82Æ\94ñ\92\8d\96Ú\95\8fÍ\82ð\90Ø\82è\91Ö\82¦\82é\82½\82ß\82É Forget -> Learn \82Ì\8f\87\82Å\8eg\97p\82µ\82Ü\82·\81B
148                 }
149                 procedure       Forget(
150                         wordCount               : TWordCount;
151                         isImportant     : Boolean );
152         end;
153
154 //==================================================
155 implementation
156 //==================================================
157
158 uses
159         SysUtils, Math, Windows;
160
161 const
162         GIKO_BAYESIAN_FILE_VERSION      = '1.0';
163 {
164         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
165                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
166                                                                 ModeWHira, ModeWKata, ModeWKanji);
167 }
168         CharMode1 : array [ 0..255 ] of Byte =
169         (
170                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172                 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
173                 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
174                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
175                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
176                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
177                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
178
179                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181                 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
182                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
183                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
184                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
185                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
187         );
188
189 //************************************************************
190 // misc
191 //************************************************************
192
193 //==============================
194 // RemoveToken
195 //==============================
196 function RemoveToken(var s: string;const delimiter: string): string;
197 var
198         p: Integer;
199 begin
200         p := AnsiPos(delimiter, s);
201         if p = 0 then
202                 Result := s
203         else
204                 Result := Copy(s, 1, p - 1);
205         s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
206 end;
207
208 //==============================
209 // AbsSort
210 //==============================
211 function AbsSort( p1, p2 : Pointer ) : Integer;
212 var
213         v1, v2 : Single;
214 begin
215
216         v1 := Abs( Single( p1 ) - 0.5 );
217         v2 := Abs( Single( p2 ) - 0.5 );
218         if v1 > v2 then
219                 Result := -1
220         else if v1 = v2 then
221                 Result := 0
222         else
223                 Result := 1;
224
225 end;
226
227 //************************************************************
228 // TWordCount class
229 //************************************************************
230 constructor TWordCount.Create;
231 begin
232
233                 Duplicates              := dupIgnore;
234                 CaseSensitive   := True;
235                 Sorted                          := True;
236
237 end;
238
239 destructor TWordCount.Destroy;
240 var
241         i : Integer;
242 begin
243
244         for i := Count - 1 downto 0 do
245                 if Objects[ i ] <> nil then
246                         Objects[ i ].Free;
247
248         inherited;
249
250 end;
251
252 //************************************************************
253 // TGikoBayesian class
254 //************************************************************
255
256 //==============================
257 // Create
258 //==============================
259 constructor TGikoBayesian.Create;
260 begin
261
262         Duplicates              := dupIgnore;
263         CaseSensitive   := True;
264         Sorted                          := True;
265
266 end;
267
268 //==============================
269 // Destroy
270 //==============================
271 destructor TGikoBayesian.Destroy;
272 var
273         i : Integer;
274 begin
275
276         for i := Count - 1 downto 0 do
277                 if inherited Objects[ i ] <> nil then
278                         inherited Objects[ i ].Free;
279
280         inherited;
281
282 end;
283
284 procedure TGikoBayesian.LoadFromFile( const filePath : string );
285 var
286         i                       : Integer;
287         sl              : TStringList;
288         s                       : string;
289         name    : string;
290         info    : TWordInfo;
291 begin
292
293         FFilePath := filePath;
294
295         if not FileExists( filePath ) then
296                 Exit;
297
298         sl := TStringList.Create;
299         try
300                 sl.LoadFromFile( filePath );
301
302                 for i := 1 to sl.Count - 1 do begin
303                         s := sl[ i ];
304                         name := RemoveToken( s, #1 );
305                         info := TWordInfo.Create;
306                         info.NormalWord                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
307                         info.ImportantWord      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
308                         info.NormalText                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
309                         info.ImportantText      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
310
311                         AddObject( name, info );
312                 end;
313         finally
314                 sl.Free;
315         end;
316
317 end;
318
319 procedure TGikoBayesian.SaveToFile( const filePath : string );
320 var
321         i                       : Integer;
322         sl              : TStringList;
323         s                       : string;
324         info    : TWordInfo;
325 begin
326
327         FFilePath := filePath;
328
329         sl := TStringList.Create;
330         try
331                 sl.BeginUpdate;
332                 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
333
334                 for i := 0 to Count - 1 do begin
335                         info := TWordInfo( inherited Objects[ i ] );
336                         s := Strings[ i ] + #1
337                                  + Format('%x', [info.NormalWord]) + #1
338                                  + Format('%x', [info.ImportantWord]) + #1
339                                  + Format('%x', [info.NormalText]) + #1
340                                  + Format('%x', [info.ImportantText]);
341
342                         sl.Add(s);
343                 end;
344                 sl.EndUpdate;
345                 sl.SaveToFile( filePath );
346         finally
347                 sl.Free;
348         end;
349
350 end;
351
352 procedure TGikoBayesian.Save;
353 begin
354
355         if FFilePath <> '' then
356                 SaveToFile( FFilePath );
357
358 end;
359
360 //==============================
361 // GetObject
362 //==============================
363 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
364 var
365         idx : Integer;
366 begin
367
368         idx := IndexOf( name ); // \8c\83\92x
369         if idx < 0 then
370                 Result := nil
371         else
372                 Result := TWordInfo( inherited Objects[ idx ] );
373
374 end;
375
376 //==============================
377 // SetObject
378 //==============================
379 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
380 var
381         idx : Integer;
382 begin
383
384         idx := IndexOf( name );
385         if idx < 0 then
386                 AddObject( name, value )
387         else
388                 inherited Objects[ idx ] := value;
389
390 end;
391
392
393 //==============================
394 // CountWord
395 //==============================
396 procedure TGikoBayesian.CountWord(
397         const text      : string;
398         wordCount               : TWordCount );
399 type
400         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
401                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
402                                                                 ModeWHira, ModeWKata, ModeWKanji);
403 var
404         p, tail, last   : PChar;
405         mode, newMode   : Modes;
406         aWord                                   : string;
407         ch                                              : Longword;
408         chSize                          : Integer;
409         delimiter                       : TStringList;
410         delimited                       : Boolean;
411         i, idx                          : Integer;
412         countInfo                       : TWordCountInfo;
413 const
414         KAKUJOSI = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç' +
415                 #10 + '\82Å' + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å';
416         kKanji = [$80..$A0, $E0..$ff];
417 begin
418
419         delimiter := TStringList.Create;
420         try
421                 mode := ModeWhite;
422                 delimiter.Text := KAKUJOSI;
423                 p                       := PChar( text );
424                 tail    := p + Length( text );
425                 last    := p;
426
427                 while p < tail do begin
428                         delimited := False;
429                         // \95\8e\9a\82Ì\83^\83C\83v\82ð\94»\95Ê
430                         // \81¦\8bå\93Ç\93_\82Í ModeGraph \82É\82È\82é\82Ì\82Å\8cÂ\95Ê\82É\91Î\89\9e\82µ\82È\82­\82Ä\82à\82¢\82¢
431 //                      if Byte(Byte( p^ ) - $a1) < $5e then begin
432                         if Byte( p^ ) in kKanji then begin
433                                 if p + 1 < tail then begin
434                                         ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
435                                         case ch of
436                                         $8140:                                                  newMode := ModeWhite;
437                                         $8141..$824e:                           newMode := ModeWGraph;
438                                         $824f..$8258:                           newMode := ModeWNum;
439                                         $8260..$829a:                           newMode := ModeWAlpha;
440                                         $829f..$82f1:                           newMode := ModeWHira;
441                                         $8340..$8396:                           newMode := ModeWKata;
442                                         else                                                            newMode := ModeWKanji;
443                                         end;
444                                         // '\81J\81K\81[' \82Í\95½\89¼\96¼\81A\82Ü\82½\82Í\83J\83^\83J\83i\82É\8aÜ\82Ü\82ê\82é
445                                         if (mode = ModeWHira) or (mode = ModeWKata) then
446                                                 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
447                                                         newMode := mode;
448                                 end else begin
449                                         newMode := ModeWhite;
450                                 end;
451
452                                 chSize := 2;
453
454                                 // \8bæ\90Ø\82è\82É\82È\82é\95\8e\9a\82ª\82 \82é\82©\8c\9f\8d¸\82·\82é
455                                 if p + 3 < tail then begin      // 3 = delimiter \82Ì\8dÅ\91å\8e\9a\90\94 - 1
456                                         for i := 0 to delimiter.Count - 1 do begin
457                                                 if CompareMem(
458                                                         p, PChar( delimiter[ i ] ), Length( delimiter[ i ] ) ) then begin
459                                                         delimited := True;
460                                                         chSize := Length( delimiter[ i ] );
461                                                         Break;
462                                                 end;
463                                         end;
464                                 end;
465                         end else begin
466                                 newMode := Modes( CharMode1[ Byte( p^ ) ] );
467
468                                 chSize := 1;
469                         end;
470
471                         if (mode <> newMode) or delimited then begin
472
473                                 // \95\8e\9a\82Ì\83^\83C\83v\82ª\95Ï\8dX\82³\82ê\82½
474                                 // \82à\82µ\82­\82Í\8bæ\90Ø\82è\82É\82È\82é\95\8e\9a\82É\91\98\8bö\82µ\82½
475                                 if mode <> ModeWhite then begin
476                                         SetLength( aWord, p - last );
477                                         CopyMemory( PChar( aWord ), last, p - last );
478                                         //aWord := Copy( last, 0, p - last );
479                                         idx := wordCount.IndexOf( aWord );      // \92x
480                                         if idx < 0 then begin
481                                                 countInfo := TWordCountInfo.Create;
482                                                 wordCount.AddObject( aWord, countInfo );
483                                         end else begin
484                                                 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
485                                         end;
486                                         countInfo.WordCount := countInfo.WordCount + 1;
487                                 end;
488
489                                 last := p;
490                                 mode := newMode;
491
492                         end;
493
494                         p := p + chSize;
495                 end;    // while
496
497                 if mode <> ModeWhite then begin
498                         aWord := Copy( last, 0, p - last );
499                         idx := wordCount.IndexOf( aWord );
500                         if idx < 0 then begin
501                                 countInfo := TWordCountInfo.Create;
502                                 wordCount.AddObject( aWord, countInfo );
503                         end else begin
504                                 countInfo := TWordCountInfo( wordCount.Objects[ idx ] );
505                         end;
506                         countInfo.WordCount := countInfo.WordCount + 1;
507                 end;
508         finally
509                 delimiter.Free;
510         end;
511
512 end;
513
514 //==============================
515 // CalcPaulGraham
516 //==============================
517 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
518
519         function p( const aWord : string ) : Single;
520         var
521                 info : TWordInfo;
522         begin
523                 info := Objects[ aWord ];
524                 if info = nil then
525                         Result := 0.4
526                 else if info.NormalWord = 0 then
527                         Result := 0.99
528                 else if info.ImportantWord = 0 then
529                         Result := 0.01
530                 else
531                         Result := ( info.ImportantWord / info.ImportantText ) /
532                                 ((info.NormalWord * 2 / info.NormalText ) +
533                                  (info.ImportantWord / info.ImportantText));
534         end;
535
536 var
537         s, q                            : Extended;
538         i                                               : Integer;
539         narray                  : TList;
540 const
541         SAMPLE_COUNT    = 15;
542 begin
543
544         Result := 1;
545         if wordCount.Count = 0 then
546                 Exit;
547
548         narray := TList.Create;
549         try
550                 for i := 0 to wordCount.Count - 1 do begin
551                         narray.Add( Pointer( p( wordCount[ i ] ) ) );
552                 end;
553
554                 narray.Sort( AbsSort );
555
556                 s := 1;
557                 q := 1;
558                 i := min( SAMPLE_COUNT, narray.Count );
559                 while i > 0 do begin
560                         Dec( i );
561                         s := s * Single( narray[ i ] );
562                         q := q * (1 - Single( narray[ i ] ));
563                 end;
564
565                 Result := s / (s + q);
566         finally
567                 narray.Free;
568         end;
569
570 end;
571
572 //==============================
573 // CalcGaryRobinson
574 //==============================
575 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
576
577         function p( const aWord : string ) : Single;
578         var
579                 info : TWordInfo;
580         begin
581                 info := Objects[ aWord ];
582                 if info = nil then
583                         Result := 0.415
584                 else if info.ImportantWord = 0 then
585                         Result := 0.0001
586                 else if info.NormalWord = 0 then
587                         Result := 0.9999
588                 else
589                         Result := ( info.ImportantWord / info.ImportantText ) /
590                                 ((info.NormalWord / info.NormalText ) +
591                                  (info.ImportantWord / info.ImportantText));
592         end;
593
594         function f( cnt : Integer; n, mean : Single ) : Extended;
595         const
596                 k = 0.00001;
597         begin
598                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
599         end;
600
601 var
602         n                                               : Extended;
603         narray                  : array of Single;
604         mean                            : Extended;
605         countInfo               : TWordCountInfo;
606         i                                               : Integer;
607         normal                  : Extended;
608         important               : Extended;
609         cnt                                     : Extended;
610 begin
611
612         if wordCount.Count = 0 then begin
613                 Result := 1;
614                 Exit;
615         end;
616
617         SetLength( narray, wordCount.Count );
618         mean := 0;
619         for i := 0 to wordCount.Count - 1 do begin
620                 n                                               := p( wordCount[ i ] );
621                 narray[ i ]     := n;
622                 mean                            := mean + n;
623         end;
624         mean := mean / wordCount.Count;
625
626         cnt                             := 0;
627         normal          := 1;
628         important       := 1;
629         for i := 0 to wordCount.Count - 1 do begin
630                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
631                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
632                 normal                  := normal * n;
633                 important               := important * (1 - n);
634                 if countInfo <> nil then
635                         cnt                                     := cnt + countInfo.WordCount;
636         end;
637         if cnt = 0 then
638                 cnt := 1;
639         normal := 1 - Exp( Ln( normal ) * (1 / cnt) );
640         important := 1 - Exp( Ln( important ) * (1 / cnt) );
641
642         n := (important - normal+ 0.00001) / (important + normal + 0.00001);
643         Result := (1 + n) / 2;
644
645 end;
646
647 //==============================
648 // Parse
649 //==============================
650 function TGikoBayesian.Parse(
651         const text                              : string;
652         wordCount                                       : TWordCount;
653         algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRonbinson
654 ) : Extended;
655 begin
656
657         CountWord( text, wordCount );
658         case algorithm of
659         gbaPaulGraham:          Result := CalcPaulGraham( wordCount );
660         gbaGaryRonbinson:       Result := CalcGaryRobinson( wordCount );
661         else                                                    Result := 0;
662         end;
663
664 end;
665
666 //==============================
667 // Learn
668 //==============================
669 procedure TGikoBayesian.Learn(
670         wordCount                : TWordCount;
671         isImportant      : Boolean );
672 var
673         aWord                   : string;
674         wordinfo        : TWordInfo;
675         countinfo       : TWordCountInfo;
676         i                                       : Integer;
677 begin
678
679         for i := 0 to wordCount.Count - 1 do begin
680                 aWord := wordCount[ i ];
681                 wordinfo := Objects[ aWord ];
682                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
683                 if wordinfo = nil then begin
684                         wordinfo := TWordInfo.Create;
685                         Objects[ aWord ] := wordinfo;
686                 end;
687
688                 if isImportant then begin
689                         wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
690                         wordinfo.ImportantText := wordinfo.ImportantText + 1;
691                 end else begin
692                         wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
693                         wordinfo.NormalText := wordinfo.NormalText + 1;
694                 end;
695         end;
696
697 end;
698
699 //==============================
700 // Forget
701 //==============================
702 procedure       TGikoBayesian.Forget(
703         wordCount               : TWordCount;
704         isImportant     : Boolean );
705 var
706         aWord                   : string;
707         wordinfo        : TWordInfo;
708         countinfo       : TWordCountInfo;
709         i                       : Integer;
710 begin
711
712         for i := 0 to wordCount.Count - 1 do begin
713                 aWord := wordCount[ i ];
714                 wordinfo := Objects[ aWord ];
715                 if wordinfo = nil then
716                         Continue;
717
718                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
719                 if isImportant then begin
720                         if wordInfo.ImportantText > 0 then begin
721                                 wordinfo.ImportantText := wordinfo.ImportantText - 1;
722                                 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
723                         end;
724                 end else begin
725                         if wordinfo.NormalText > 0 then begin
726                                 wordinfo.NormalText := wordinfo.NormalText - 1;
727                                 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
728                         end;
729                 end;
730         end;
731
732 end;
733
734 end.