OSDN Git Service

平仮名を辞書に含めないようになった。
[gikonavigoeson/gikonavi.git] / GikoBayesian.pas
1 unit GikoBayesian;
2
3 {!
4 \file           GikoBayesian.pas
5 \brief  \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
6
7 $Id: GikoBayesian.pas,v 1.14 2004/11/01 09:51:57 yoffy Exp $
8 }
9
10 //! \95½\89¼\96¼\82ð\8e«\8f\91\82É\8aÜ\82ß\82È\82¢
11 {$DEFINE GIKO_BAYESIAN_NO_HIRAGANA_DIC}
12
13 interface
14
15 //==================================================
16 uses
17 //==================================================
18         Classes;
19
20 //==================================================
21 type
22 //==================================================
23
24         {!***********************************************************
25         \brief \92P\8cê\83v\83\8d\83p\83e\83B
26         ************************************************************}
27         TWordInfo       = class( TObject )
28         private
29                 FNormalWord                     :       Integer;        //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
30                 FImportantWord  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\93o\8fê\82µ\82½\89ñ\90\94
31                 FNormalText                     : Integer;      //!< \92Ê\8fí\82Ì\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
32                 FImportantText  : Integer;      //!< \92\8d\96Ú\92P\8cê\82Æ\82µ\82Ä\8aÜ\82Ü\82ê\82Ä\82¢\82½\95\8fÍ\82Ì\90\94
33
34         public
35                 property NormalWord                     : Integer       read FNormalWord write FNormalWord;
36                 property ImportantWord  : Integer       read FImportantWord write FImportantWord;
37                 property NormalText                     : Integer       read FNormalText write FNormalText;
38                 property ImportantText  : Integer       read FImportantText write FImportantText;
39         end;
40
41         {!***********************************************************
42         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83v\83\8d\83p\83e\83B
43         ************************************************************}
44         TWordCountInfo  = class( TObject )
45         private
46                 FWordCount      :       Integer;        //!< \92P\8cê\90\94
47
48         public
49                 property WordCount      : Integer       read FWordCount write FWordCount;
50         end;
51
52         {!***********************************************************
53         \brief \89ð\90Í\8dÏ\82Ý\92P\8cê\83\8a\83X\83g
54         ************************************************************}
55 //      TWordCount      = class( THashedStringList )    // \8c\83\92x
56         TWordCount      = class( TStringList )
57         public
58                 constructor Create;
59                 destructor Destroy; override;
60         end;
61
62         {!***********************************************************
63         \brief \83t\83B\83\8b\83^\83A\83\8b\83S\83\8a\83Y\83\80
64         ************************************************************}
65         TGikoBayesianAlgorithm =
66                 (gbaPaulGraham, gbaGaryRobinson, gbaGaryRobinsonFisher);
67
68         {!***********************************************************
69         \brief \83x\83C\83W\83A\83\93\83t\83B\83\8b\83^
70         ************************************************************}
71 //      TGikoBayesian = class( THashedStringList )      // \8c\83\92x
72         TGikoBayesian = class( TStringList )
73         private
74                 FFilePath       : string;       //!< \93Ç\82Ý\8d\9e\82ñ\82¾\83t\83@\83C\83\8b\83p\83X
75                 function GetObject( const name : string ) : TWordInfo;
76                 procedure SetObject( const name : string; value : TWordInfo );
77
78         public
79                 constructor Create;
80                 destructor Destroy; override;
81
82                 //! \83t\83@\83C\83\8b\82©\82ç\8aw\8fK\97\9a\97ð\82ð\93Ç\82Ý\8fo\82µ\82Ü\82·
83                 procedure LoadFromFile( const filePath : string );
84
85                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
86                 procedure SaveToFile( const filePath : string );
87
88                 //! \83t\83@\83C\83\8b\82É\8aw\8fK\97\9a\97ð\82ð\95Û\91\82µ\82Ü\82·
89                 procedure Save;
90
91                 //! \92P\8cê\82É\91Î\82·\82é\8fî\95ñ\82ð\8eæ\93¾\82µ\82Ü\82·
92                 property Objects[ const name : string ] : TWordInfo
93                         read GetObject write SetObject; default;
94
95                 //! \95\8fÍ\82É\8aÜ\82Ü\82ê\82é\92P\8cê\82ð\83J\83E\83\93\83g\82µ\82Ü\82·
96                 procedure CountWord(
97                         const text      : string;
98                         wordCount               : TWordCount );
99
100                 {!
101                 \brief  Paul Graham \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
102                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
103                 }
104                 function CalcPaulGraham( wordCount : TWordCount ) : Extended;
105
106                 {!
107                 \brief  GaryRobinson \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
108                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
109                 }
110                 function CalcGaryRobinson( wordCount : TWordCount ) : Extended;
111
112                 {!
113                 \brief  GaryRobinson-Fisher \96@\82É\8aî\82Ã\82¢\82Ä\95\8fÍ\82Ì\92\8d\96Ú\93x\82ð\8c\88\92è\82µ\82Ü\82·
114                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
115                 }
116                 function CalcGaryRobinsonFisher( wordCount : TWordCount ) : Extended;
117
118                 {!
119                 \brief  \95\8fÍ\82ð\89ð\90Í
120                 \param  text                                    \89ð\90Í\82·\82é\95\8fÍ
121                 \param  wordCount                       \89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g\82ª\95Ô\82é
122                 \param  algorithm                       \92\8d\96Ú\93x\82Ì\8c\88\92è\82É\97p\82¢\82é\83A\83\8b\83S\83\8a\83Y\83\80\82ð\8ew\92è\82µ\82Ü\82·
123                 \return \95\8fÍ\82Ì\92\8d\96Ú\93x (\92\8d\96Ú\82É\92l\82µ\82È\82¢ 0.0\81`1.0 \92\8d\96Ú\82·\82×\82«)
124
125                 CountWord \82Æ Calcxxxxx \82ð\82Ü\82Æ\82ß\82Ä\8eÀ\8ds\82·\82é\82¾\82¯\82Å\82·\81B
126                 }
127                 function Parse(
128                         const text                              : string;
129                         wordCount                                       : TWordCount;
130                         algorithm                                       : TGikoBayesianAlgorithm = gbaGaryRobinsonFisher
131                 ) : Extended;
132
133                 {!
134                 \brief  \8aw\8fK\82·\82é
135                 \param  wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
136                 \param  isImportant \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82é\82È\82ç True
137                 }
138                 procedure Learn(
139                         wordCount                : TWordCount;
140                         isImportant      : Boolean );
141
142                 {!
143                 \brief          \8aw\8fK\8c\8b\89Ê\82ð\96Y\82ê\82é
144                 \param          wordCount               Parse \82Å\89ð\90Í\82³\82ê\82½\92P\8cê\83\8a\83X\83g
145                 \param          isImportant     \92\8d\96Ú\82·\82×\82«\95\8fÍ\82Æ\82µ\82Ä\8ao\82¦\82ç\82ê\82Ä\82¢\82½\82È\82ç True
146                 \warning        \8aw\8fK\8dÏ\82Ý\82Ì\95\8fÍ\82©\82Ç\82¤\82©\82Í\8am\94F\8fo\97\88\82Ü\82¹\82ñ\81B<br>
147                                                         Learn \82µ\82Ä\82¢\82È\82¢\95\8fÍ\82â isImportant \82ª\8aÔ\88á\82Á\82Ä\82¢\82é\95\8fÍ\82ð
148                                                         Forget \82·\82é\82Æ\83f\81[\83^\83x\81[\83X\82ª\94j\91¹\82µ\82Ü\82·\81B<br>
149                                                         \8aw\8fK\8dÏ\82Ý\82©\82Ç\82¤\82©\82Í\93Æ\8e©\82É\8aÇ\97\9d\82µ\82Ä\82­\82¾\82³\82¢\81B
150
151                 \91S\82Ä\82Ì\8aw\8fK\8c\8b\89Ê\82ð\83N\83\8a\83A\82·\82é\82í\82¯\82Å\82Í\82 \82è\82Ü\82¹\82ñ\81B<br>
152                 wordCount \82ð\93¾\82½\95\8fÍ (Parse \82Ì text \88ø\90\94\82Ì\8aw\8fK\8c\8b\89Ê\82Ì\82Ý\83N\83\8a\83A\82µ\82Ü\82·\81B<br><br>
153
154                 \8eå\82É\92\8d\96Ú\95\8fÍ\82Æ\94ñ\92\8d\96Ú\95\8fÍ\82ð\90Ø\82è\91Ö\82¦\82é\82½\82ß\82É Forget -> Learn \82Ì\8f\87\82Å\8eg\97p\82µ\82Ü\82·\81B
155                 }
156                 procedure       Forget(
157                         wordCount               : TWordCount;
158                         isImportant     : Boolean );
159         end;
160
161 //==================================================
162 implementation
163 //==================================================
164
165 uses
166         SysUtils, Math, Windows,
167         MojuUtils;
168
169 const
170         GIKO_BAYESIAN_FILE_VERSION      = '1.0';
171 {
172         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeHanKana, ModeNum,
173                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
174                                                                 ModeWHira, ModeWKata, ModeWKanji);
175 }
176         CharMode1 : array [ 0..255 ] of Byte =
177         (
178                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180                 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
181                 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
182                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
183                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1,
184                 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
185                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
186
187                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189                 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
190                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
191                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
192                 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
193                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
195         );
196
197 //************************************************************
198 // misc
199 //************************************************************
200
201 //==============================
202 // RemoveToken
203 //==============================
204 function RemoveToken(var s: string;const delimiter: string): string;
205 var
206         p: Integer;
207 begin
208         p := AnsiPos(delimiter, s);
209         if p = 0 then
210                 Result := s
211         else
212                 Result := Copy(s, 1, p - 1);
213         s := Copy(s, Length(Result) + Length(delimiter) + 1, Length(s));
214 end;
215
216 //==============================
217 // AbsSort
218 //==============================
219 function AbsSort( p1, p2 : Pointer ) : Integer;
220 var
221         v1, v2 : Single;
222 begin
223
224         v1 := Abs( Single( p1 ) - 0.5 );
225         v2 := Abs( Single( p2 ) - 0.5 );
226         if v1 > v2 then
227                 Result := -1
228         else if v1 = v2 then
229                 Result := 0
230         else
231                 Result := 1;
232
233 end;
234
235 //************************************************************
236 // TWordCount class
237 //************************************************************
238 constructor TWordCount.Create;
239 begin
240
241                 Duplicates              := dupIgnore;
242                 CaseSensitive   := True;
243                 Sorted                          := True;
244
245 end;
246
247 destructor TWordCount.Destroy;
248 var
249         i : Integer;
250 begin
251
252         for i := Count - 1 downto 0 do
253                 if Objects[ i ] <> nil then
254                         Objects[ i ].Free;
255
256         inherited;
257
258 end;
259
260 //************************************************************
261 // TGikoBayesian class
262 //************************************************************
263
264 //==============================
265 // Create
266 //==============================
267 constructor TGikoBayesian.Create;
268 begin
269
270         Duplicates              := dupIgnore;
271         CaseSensitive   := True;
272         Sorted                          := True;
273
274 end;
275
276 //==============================
277 // Destroy
278 //==============================
279 destructor TGikoBayesian.Destroy;
280 var
281         i : Integer;
282 begin
283
284         for i := Count - 1 downto 0 do
285                 if inherited Objects[ i ] <> nil then
286                         inherited Objects[ i ].Free;
287
288         inherited;
289
290 end;
291
292 procedure TGikoBayesian.LoadFromFile( const filePath : string );
293 var
294         i                       : Integer;
295         sl              : TStringList;
296         s                       : string;
297         name    : string;
298         info    : TWordInfo;
299 begin
300
301         FFilePath := filePath;
302
303         if not FileExists( filePath ) then
304                 Exit;
305
306         sl := TStringList.Create;
307         try
308                 sl.LoadFromFile( filePath );
309
310                 for i := 1 to sl.Count - 1 do begin
311                         s := sl[ i ];
312                         name := RemoveToken( s, #1 );
313                         info := TWordInfo.Create;
314                         info.NormalWord                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
315                         info.ImportantWord      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
316                         info.NormalText                 := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
317                         info.ImportantText      := StrToIntDef( '$' + RemoveToken( s, #1 ), 0 );
318
319                         AddObject( name, info );
320                 end;
321         finally
322                 sl.Free;
323         end;
324
325 end;
326
327 procedure TGikoBayesian.SaveToFile( const filePath : string );
328 var
329         i                       : Integer;
330         sl              : TStringList;
331         s                       : string;
332         info    : TWordInfo;
333 begin
334
335         FFilePath := filePath;
336
337         sl := TStringList.Create;
338         try
339                 sl.BeginUpdate;
340                 sl.Add( GIKO_BAYESIAN_FILE_VERSION );
341
342                 for i := 0 to Count - 1 do begin
343                         info := TWordInfo( inherited Objects[ i ] );
344                         s := Strings[ i ] + #1
345                                  + Format('%x', [info.NormalWord]) + #1
346                                  + Format('%x', [info.ImportantWord]) + #1
347                                  + Format('%x', [info.NormalText]) + #1
348                                  + Format('%x', [info.ImportantText]);
349
350                         sl.Add(s);
351                 end;
352                 sl.EndUpdate;
353                 sl.SaveToFile( filePath );
354         finally
355                 sl.Free;
356         end;
357
358 end;
359
360 procedure TGikoBayesian.Save;
361 begin
362
363         if FFilePath <> '' then
364                 SaveToFile( FFilePath );
365
366 end;
367
368 //==============================
369 // GetObject
370 //==============================
371 function TGikoBayesian.GetObject( const name : string ) : TWordInfo;
372 var
373         idx : Integer;
374 begin
375
376         if Find( name, idx ) then
377                 Result := TWordInfo( inherited Objects[ idx ] )
378         else
379                 Result := nil;
380
381 end;
382
383 //==============================
384 // SetObject
385 //==============================
386 procedure TGikoBayesian.SetObject( const name : string; value : TWordInfo );
387 var
388         idx : Integer;
389 begin
390
391         if Find( name, idx ) then
392                 inherited Objects[ idx ] := value
393         else
394                 AddObject( name, value );
395
396 end;
397
398
399 //==============================
400 // CountWord
401 //==============================
402 procedure TGikoBayesian.CountWord(
403         const text      : string;
404         wordCount               : TWordCount );
405 type
406         Modes                           = (ModeWhite, ModeGraph, ModeAlpha, ModeNum, ModeHanKana,
407                                                                 ModeWGraph, ModeWAlpha, ModeWNum,
408                                                                 ModeWHira, ModeWKata, ModeWKanji);
409 var
410         p, tail, last                   : PChar;
411         mode, newMode                   : Modes;
412         ch                                                              : Longword;
413         chSize                                          : Integer;
414         wHiraDelimiter          : TStringList;
415         wHiraFinalDelimiter     : TStringList;
416         wKanjiDelimiter         : TStringList;
417         words                                                   : TStringList;
418         aWord                                                   : string;
419         countInfo                                       : TWordCountInfo;
420
421         function cutBoth( _aWord : string; _delim : TStringList ) : string;
422         var
423                 _i                      : Integer;
424         begin
425                 for _i := 0 to _delim.Count - 1 do begin
426                         _aWord := CustomStringReplace(
427                                 _aWord,
428                                 _delim[ _i ],
429                                 #10 + _delim[ _i ] + #10, False );
430                 end;
431                 Result := _aWord;
432         end;
433
434         function cutFirst( _aWord : string; _delim : TStringList ) : string;
435         var
436                 _i                      : Integer;
437         begin
438                 for _i := 0 to _delim.Count - 1 do begin
439                         _aWord := CustomStringReplace(
440                                 _aWord,
441                                 _delim[ _i ],
442                                 #10 + _delim[ _i ], False );
443                 end;
444                 Result := _aWord;
445         end;
446
447         function cutFinal( _aWord : string; _delim : TStringList ) : string;
448         var
449                 _i                      : Integer;
450         begin
451                 for _i := 0 to _delim.Count - 1 do begin
452                         _aWord := CustomStringReplace(
453                                 _aWord,
454                                 _delim[ _i ],
455                                 _delim[ _i ] + #10, False );
456                 end;
457                 Result := _aWord;
458         end;
459
460         procedure addWord( _dst : TWordCount; _words : TStringList );
461         var
462                 _aWord                  : string;
463                 _i, _idx                : Integer;
464                 _countInfo      : TWordCountInfo;
465         begin
466                 for _i := 0 to _words.Count - 1 do begin
467                         _aWord := _words[ _i ];
468                         if Length( _aWord ) > 0 then begin
469                                 if _dst.Find( _aWord, _idx ) then begin
470                                         _countInfo := TWordCountInfo( _dst.Objects[ _idx ] );
471                                 end else begin
472                                         _countInfo := TWordCountInfo.Create;
473                                         _dst.AddObject( _aWord, _countInfo );
474                                 end;
475                                 _countInfo.WordCount := _countInfo.WordCount + 1;
476                         end;
477                 end;
478         end;
479
480         function changeMode( _aWord : string; _mode : Modes ) : string;
481         var
482                 _i                                                                      : Integer;
483                 _aWord2                                                 : string;
484                 _pWord, _pWord2                 : PChar;
485                 _pWordTail, _pFound     : PChar;
486         const
487                 _delim : string = #10;
488         begin
489 {$IFDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
490                 if mode = ModeWHira then begin
491                         Result := '';
492                         Exit;
493                 end;
494 {$ENDIF}
495                 if Ord( _mode ) >= Ord( ModeWGraph ) then begin
496                         // \93ú\96{\8cê
497                         // \83X\83y\81[\83X\82ð\8bl\82ß\82é
498                         _aWord := CustomStringReplace( _aWord, ' ', '', False );
499                         _aWord := CustomStringReplace( _aWord, '\81@', '', False );
500
501                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
502                         case mode of
503                         ModeWHira:
504                                 begin
505                                         _aWord := cutFinal( _aWord, wHiraFinalDelimiter );
506                                         Result := cutBoth( _aWord, wHiraDelimiter );
507                                 end;
508
509                         ModeWKanji:
510                                 begin
511                                         // \83f\83\8a\83~\83^\82Å\92P\8cê\95ª\82¯
512                                         _aWord := cutBoth( _aWord, wKanjiDelimiter );
513                                         // 4 byte (2 \8e\9a\82¸\82Â\82Å\92P\8cê\95ª\82¯
514                                         _pWord := PChar( _aWord );
515                                         _i := Length( _aWord );
516                                         _pWordTail := _pWord + _i;
517                                         SetLength( _aWord2, _i + (_i shr 2) );
518                                         _pWord2 := PChar( _aWord2 );
519
520                                         while _pWord < _pWordTail do begin
521                                                 _pFound := AnsiStrPos( _pWord, PChar( _delim ) );
522                                                 if _pFound = nil then
523                                                         _pFound := _pWordTail;
524                                                 _pFound := _pFound - 3;
525
526                                                 while _pWord <= _pFound do begin
527                                                         CopyMemory( _pWord2, _pWord, 4 ); _pWord2[ 4 ] := #10;
528                                                         _pWord2 := _pWord2 + 5; _pWord := _pWord + 4;
529                                                 end;
530                                                 _i := _pFound + 4 - _pWord; // 4 = 3 + #10
531                                                 CopyMemory( _pWord2, _pWord, _i );
532                                                 _pWord2 := _pWord2 + _i; _pWord := _pWord + _i;
533                                         end;
534                                         if _pWord < _pWordTail then begin
535                                                 _i := _pWordTail - _pWord;
536                                                 CopyMemory( _pWord2, _pWord, _i );
537                                                 _pWord2 := _pWord2 + _i;
538                                         end;
539                                         SetLength( _aWord2, _pWord2 - PChar( _aWord2 ) );
540
541                                         Result := _aWord2;
542                                 end;
543
544                         else
545                                 Result := _aWord;
546                         end;
547                 end else begin
548                         Result := _aWord;
549                 end;
550         end;
551 const
552         WHIRA_DELIMITER = '\82ð' + #10 + '\82É' + #10 + '\82ª' + #10 + '\82Æ' + #10 + '\82©\82ç'
553                 + #10 + '\82Ö' + #10 + '\82æ\82è' + #10 + '\82Ü\82Å'+ #10 + '\82Å'
554                 + #10 + '\82±\82±' + #10 + '\82»\82±' + #10 + '\82Ç\82±'
555                 + #10 + '\82±\82ê' + #10 + '\82»\82ê' + #10 + '\82 \82ê' + #10 + '\82Ç\82ê'
556                 + #10 + '\82±\82Ì' + #10 + '\82»\82Ì' + #10 + '\82 \82Ì' + #10 + '\82Ç\82Ì'
557                 + #10 + '\82±\82¤' + #10 + '\82»\82¤' + #10 + '\82 \82 ' + #10 + '\82Ç\82¤'
558                 + #10 + '\82±\82ñ\82È' + #10 + '\82»\82ñ\82È' + #10 + '\82 \82ñ\82È' + #10 + '\82Ç\82ñ\82È'
559                 + #10 + '\82ê\82½' + #10 + '\82ê\82Ä' + #10 + '\82ê\82ê' + #10 + '\82ê\82ë'
560                 + #10 + '\82ê\82é' + #10 + '\82ç\82ê\82é'
561                 + #10 + '\82Å\82·' + #10 + '\82Ü\82·' + #10 + '\82Ü\82¹\82ñ'
562                 + #10 + '\82Å\82µ\82½' + #10 + '\82Ü\82µ\82½'
563                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82³\82ê\82é' + #10 + '\82³\82ê\82È\82¢'
564                 ;
565         WKANJI_DELIMITER = '\93I' + #10 + '\90«' + #10 + '\8e®' + #10 + '\89»' + #10 + '\96@'
566                 + #10 + '\95s' + #10 + '\96³' + #10 + '\94ñ' + #10 + '\94½'
567                 ;
568         WHIRA_FINAL_DELIMITER = '\82Á\82½' + #10 + '\82Á\82Ä'
569                 ;{
570                 + #10 + '\82æ\82Á\82Ä' + #10 + '\82µ\82½\82ª\82Á\82Ä' + #10 + '\82È\82Ì\82Å'
571                 + #10 + '\82¾\82©\82ç' + #10 + '\82Å\82·\82©\82ç'
572                 + #10 + '\82Ü\82½'
573                 + #10 + '\82µ\82©\82µ' + #10 + '\82¾\82ª' + #10 + '\82¯\82Ç' + #10 + '\82¯\82ê\82Ç'
574                 + #10 + '\82â\82Í\82è' + #10 + '\82â\82Á\82Ï\82è'
575                 + #10 + '\82Å\82µ' + #10 + '\82¾\82ë'
576                 + #10 + '\82·\82é' + #10 + '\82µ\82È\82¢' + #10 + '\82µ\82½' + #10 + '\82µ\82È\82¢'
577                 ;}
578         // '\81[' \82ð '\82\9f\82¡\82£\82¥\82§' \82É\81B
579         HA_LINE = '\82 \82©\82³\82½\82È\82Í\82Ü\82â\82ç\82í\82ª\82´\82¾\82Î\82Ï\82\9f\82ì';
580         HI_LINE = '\82¢\82«\82µ\82¿\82É\82Ð\82Ý\82è\82î\82¬\82\82Ñ\82Ò\82¡';
581         HU_LINE = '\82¤\82­\82·\82Â\82Ê\82Ó\82Þ\82ä\82é\82®\82Ô\82Õ\82£';
582         HE_LINE = '\82¦\82¯\82¹\82Ä\82Ë\82Ö\82ß\82ê\82ï\82°\82×\82Ø\82¥';
583         HO_LINE = '\82¨\82±\82»\82Æ\82Ì\82Ù\82à\82æ\82ë\82ð\82²\82Ú\82Û\82§';
584         KA_LINE = '\83A\83J\83T\83^\83i\83n\83}\83\84\83\89\83\8f\83K\83U\83_\83o\83p\83@\83\95\83\8e';
585         KI_LINE = '\83C\83L\83V\83`\83j\83q\83~\83\8a\83\90\83M\83W\83r\83s\83B';
586         KU_LINE = '\83E\83N\83X\83c\83k\83t\83\80\83\86\83\8b\83O\83u\83v\83D\83\94';
587         KE_LINE = '\83G\83P\83Z\83e\83l\83w\83\81\83\8c\83\91\83Q\83x\83y\83F\83\96';
588         KO_LINE = '\83I\83R\83\\83g\83m\83z\83\82\83\88\83\8d\83\92\83S\83{\83|\83H';
589         kKanji = [$80..$A0, $E0..$ff];
590 begin
591
592         wHiraDelimiter  := TStringList.Create;
593         wHiraFinalDelimiter := TStringList.Create;
594         wKanjiDelimiter := TStringList.Create;
595         words := TStringList.Create;
596         try
597                 mode := ModeWhite;
598 {$IFNDEF GIKO_BAYESIAN_NO_HIRAGANA_DIC}
599                 wHiraDelimiter.Text := WHIRA_DELIMITER;
600                 wHiraFinalDelimiter.Text := WHIRA_FINAL_DELIMITER;
601 {$ENDIF}
602                 wKanjiDelimiter.Text := WKANJI_DELIMITER;
603                 p                       := PChar( text );
604                 tail    := p + Length( text );
605                 last    := p;
606
607                 while p < tail do begin
608                         // \95\8e\9a\82Ì\83^\83C\83v\82ð\94»\95Ê
609                         // \81¦\8bå\93Ç\93_\82Í ModeGraph \82É\82È\82é\82Ì\82Å\8cÂ\95Ê\82É\91Î\89\9e\82µ\82È\82­\82Ä\82à\82¢\82¢
610 //                      if Byte(Byte( p^ ) - $a1) < $5e then begin
611                         if Byte( p^ ) in kKanji then begin
612                                 if p + 1 < tail then begin
613                                         ch := (PByte( p )^ shl 8) or PByte( p + 1 )^;
614                                         case ch of
615                                         // \83X\83y\81[\83X\82Å\92P\8cê\95ª\82¯\82¹\82¸\82É\8bl\82ß\82é
616                                         //$8140:                                                        newMode := ModeWhite;
617                                         $8141..$824e:                           newMode := ModeWGraph;
618                                         $824f..$8258:                           newMode := ModeWNum;
619                                         $8260..$829a:                           newMode := ModeWAlpha;
620                                         $829f..$82f1:                           newMode := ModeWHira;
621                                         $8340..$8396:                           newMode := ModeWKata;
622                                         else                                                            newMode := ModeWKanji;
623                                         end;
624                                         // '\81J\81K\81[' \82Í\95½\89¼\96¼\81A\82Ü\82½\82Í\83J\83^\83J\83i\82É\8aÜ\82Ü\82ê\82é
625                                         if (mode = ModeWHira) or (mode = ModeWKata) then
626                                                 if (ch = $814a) or (ch = $814b) or (ch = $815b) then
627                                                         newMode := mode;
628                                 end else begin
629                                         newMode := ModeWhite;
630                                 end;
631
632                                 chSize := 2;
633                         end else begin
634                                 newMode := Modes( CharMode1[ Byte( p^ ) ] );
635                                 if (p^ = ' ') and (Ord( mode ) >= Ord( ModeWGraph )) then begin
636                                         // \8d¡\82Ü\82Å\93ú\96{\8cê\82Å\8d¡\83X\83y\81[\83X
637                                         // \92P\8cê\82ð\8cq\82°\82Ä\8cã\82Å\83X\83y\81[\83X\82ð\8bl\82ß\82é
638                                         // \81¦\94¼\8ap\83J\83i\82Í\92Ê\8fí\83X\83y\81[\83X\82Å\8bæ\90Ø\82é\82¾\82ë\82¤\82©\82ç\8bl\82ß\82È\82¢
639                                         newMode := mode;
640                                 end;
641
642                                 chSize := 1;
643                         end;
644
645                         if mode <> newMode then begin
646
647                                 // \95\8e\9a\82Ì\83^\83C\83v\82ª\95Ï\8dX\82³\82ê\82½
648                                 if mode <> ModeWhite then begin
649                                         SetLength( aWord, p - last );
650                                         CopyMemory( PChar( aWord ), last, p - last );
651
652                                         words.Text := changeMode( aWord, mode );
653
654                                         // \92P\8cê\93o\98^
655                                         addWord( wordCount, words );
656                                 end;
657
658                                 last := p;
659                                 mode := newMode;
660
661                         end;
662
663                         p := p + chSize;
664                 end;    // while
665
666                 if mode <> ModeWhite then begin
667                         SetLength( aWord, p - last );
668                         CopyMemory( PChar( aWord ), last, p - last );
669
670                         words.Text := changeMode( aWord, mode );
671
672                         // \92P\8cê\93o\98^
673                         addWord( wordCount, words );
674                 end;
675         finally
676                 words.Free;
677                 wKanjiDelimiter.Free;
678                 wHiraFinalDelimiter.Free;
679                 wHiraDelimiter.Free;
680         end;
681
682 end;
683
684 //==============================
685 // CalcPaulGraham
686 //==============================
687 function TGikoBayesian.CalcPaulGraham( wordCount : TWordCount ) : Extended;
688
689         function p( const aWord : string ) : Single;
690         var
691                 info : TWordInfo;
692         begin
693                 info := Objects[ aWord ];
694                 if info = nil then
695                         Result := 0.415
696                 else if info.NormalWord = 0 then
697                         Result := 0.99
698                 else if info.ImportantWord = 0 then
699                         Result := 0.01
700                 else if info.ImportantWord + info.NormalWord * 2 < 5 then
701                         Result := 0.5
702                 else
703                         Result := ( info.ImportantWord / info.ImportantText ) /
704                                 ((info.NormalWord * 2 / info.NormalText ) +
705                                  (info.ImportantWord / info.ImportantText));
706         end;
707
708 var
709         s, q                            : Extended;
710         i                                               : Integer;
711         narray                  : TList;
712 const
713         SAMPLE_COUNT    = 15;
714 begin
715
716         Result := 1;
717         if wordCount.Count = 0 then
718                 Exit;
719
720         narray := TList.Create;
721         try
722                 for i := 0 to wordCount.Count - 1 do begin
723                         narray.Add( Pointer( p( wordCount[ i ] ) ) );
724                 end;
725
726                 narray.Sort( AbsSort );
727
728                 s := 1;
729                 q := 1;
730                 i := min( SAMPLE_COUNT, narray.Count );
731                 while i > 0 do begin
732                         Dec( i );
733
734                         s := s * Single( narray[ i ] );
735                         q := q * (1 - Single( narray[ i ] ));
736                 end;
737
738                 Result := s / (s + q);
739         finally
740                 narray.Free;
741         end;
742
743 end;
744
745 //==============================
746 // CalcGaryRobinson
747 //==============================
748 function TGikoBayesian.CalcGaryRobinson( wordCount : TWordCount ) : Extended;
749
750         function p( const aWord : string ) : Single;
751         var
752                 info : TWordInfo;
753         begin
754                 info := Objects[ aWord ];
755                 if info = nil then
756                         Result := 0.415
757                 else if info.ImportantWord = 0 then
758                         Result := 0.01
759                 else if info.NormalWord = 0 then
760                         Result := 0.99
761                 else
762                         Result := ( info.ImportantWord / info.ImportantText ) /
763                                 ((info.NormalWord / info.NormalText ) +
764                                  (info.ImportantWord / info.ImportantText));
765         end;
766
767         function f( cnt : Integer; n, mean : Single ) : Extended;
768         const
769                 k = 0.00001;
770         begin
771                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
772         end;
773
774 var
775         n                                               : Extended;
776         narray                  : array of Single;
777         mean                            : Extended;
778         countInfo               : TWordCountInfo;
779         i                                               : Integer;
780         P1, Q1, R1      : Extended;
781         cnt                                     : Extended;
782 begin
783
784         if wordCount.Count = 0 then begin
785                 Result := 1;
786                 Exit;
787         end;
788
789         SetLength( narray, wordCount.Count );
790         mean := 0;
791         for i := 0 to wordCount.Count - 1 do begin
792                 n                                               := p( wordCount[ i ] );
793                 narray[ i ]     := n;
794                 mean                            := mean + n;
795         end;
796         mean := mean / wordCount.Count;
797
798         P1 := 1;
799         Q1 := 1;
800         for i := 0 to wordCount.Count - 1 do begin
801                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
802                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
803                 P1 := P1 * ( 1 - n );
804                 Q1 := Q1 * n;
805         end;
806         cnt := wordCount.Count;
807         if cnt = 0 then
808                 cnt := 1
809         else
810         P1 := 1 - Power( P1, 1 / cnt );
811         Q1 := 1 - Power( Q1, 1 / cnt );
812
813         if P1 + Q1 = 0 then begin
814                 Result := 0.5
815         end else begin
816                 n := (P1 - Q1) / (P1 + Q1);
817                 Result := (1 + n) / 2;
818         end;
819
820 end;
821
822 //==============================
823 // CalcGaryRobinsonFisher
824 //==============================
825 function TGikoBayesian.CalcGaryRobinsonFisher(
826         wordCount : TWordCount
827 ) : Extended;
828
829         function p( const aWord : string ) : Single;
830         var
831                 info                            : TWordInfo;
832         begin
833                 info := Objects[ aWord ];
834                 if info = nil then
835                         Result := 0.415
836                 else if info.ImportantWord = 0 then
837                         Result := 0.01
838                 else if info.NormalWord = 0 then
839                         Result := 0.99
840                 else
841                         Result := info.ImportantWord /
842                                 (info.ImportantWord + info.NormalWord *
843                                  info.ImportantText / info.NormalText);
844         end;
845
846         function f( cnt : Integer; n, mean : Single ) : Extended;
847         const
848                 k = 0.00001;
849         begin
850                 Result := ( (k * mean) + (cnt * n) ) / (k + cnt);
851         end;
852
853         function prbx( x2, degree : Extended ) : Extended;
854         var
855                 m : Extended;
856                 sum : Extended;
857                 term : Extended;
858                 i : extended;
859         begin
860
861                 m := x2 / 2;
862                 sum := exp( -m );
863                 term := -m;
864
865                 i := 1;
866                 while i < (degree / 2 - 1) do begin
867                         term := term + ln( m / i );
868                         sum := sum + exp( term );
869                         i := i + 1;
870                 end;
871
872                 if sum < 1 then
873                         Result := sum
874                 else
875                         Result := 1.0;
876
877         end;
878
879 var
880         n                                               : Extended;
881         narray                  : array of Single;
882         mean                            : Extended;
883         countInfo               : TWordCountInfo;
884         i                                               : Integer;
885         normal                  : Extended;
886         important               : Extended;
887         P1, Q1                  : Extended;
888         cnt                                     : Extended;
889 begin
890
891         if wordCount.Count = 0 then begin
892                 Result := 1;
893                 Exit;
894         end;
895
896         SetLength( narray, wordCount.Count );
897         mean := 0;
898         for i := 0 to wordCount.Count - 1 do begin
899                 n                                               := p( wordCount[ i ] );
900                 narray[ i ]     := n;
901                 mean                            := mean + n;
902         end;
903         mean := mean / wordCount.Count;
904
905         cnt := 0;
906 (*
907         P1 := 1;
908         Q1 := 1;
909 (*)
910         P1 := 0;
911         Q1 := 0;
912 //*
913         for i := 0 to wordCount.Count - 1 do begin
914                 countInfo       := TWordCountInfo( wordCount.Objects[ i ] );
915                 n                                               := f( countInfo.WordCount, narray[ i ], mean );
916                 if countInfo <> nil then
917                         cnt := cnt + countInfo.WordCount;
918 (*
919                 P1 := P1 + Ln( 1 - n ) * countInfo.WordCount;
920                 Q1 := Q1 + Ln( n ) * countInfo.WordCount;
921 (*)
922                 P1 := P1 + Ln( 1 - n );
923                 Q1 := Q1 + Ln( n );
924 //*)
925         end;
926         if cnt = 0 then
927                 cnt := 1;
928 //(*
929         P1 := prbx( -2 * P1, 2 * cnt );
930         Q1 := prbx( -2 * Q1, 2 * cnt );
931 (*)
932         P1 := prbx( -2 * Ln( P1 ), 2 * cnt );
933         Q1 := prbx( -2 * Ln( Q1 ), 2 * cnt );
934 //*)
935         if P1 + Q1 = 0 then begin
936                 Result := 0.5
937         end else begin
938                 Result := (1 + Q1 + P1) / 2;
939         end;
940
941 end;
942
943 //==============================
944 // Parse
945 //==============================
946 function TGikoBayesian.Parse(
947         const text                              : string;
948         wordCount                                       : TWordCount;
949         algorithm                                       : TGikoBayesianAlgorithm
950 ) : Extended;
951 begin
952
953         CountWord( text, wordCount );
954         case algorithm of
955         gbaPaulGraham:          Result := CalcPaulGraham( wordCount );
956         gbaGaryRobinson:        Result := CalcGaryRobinson( wordCount );
957         gbaGaryRobinsonFisher:
958                                                                                 Result := CalcGaryRobinsonFisher( wordCount );
959         else                                                    Result := 0;
960         end;
961
962 end;
963
964 //==============================
965 // Learn
966 //==============================
967 procedure TGikoBayesian.Learn(
968         wordCount                : TWordCount;
969         isImportant      : Boolean );
970 var
971         aWord                   : string;
972         wordinfo        : TWordInfo;
973         countinfo       : TWordCountInfo;
974         i                                       : Integer;
975 begin
976
977         for i := 0 to wordCount.Count - 1 do begin
978                 aWord := wordCount[ i ];
979                 wordinfo := Objects[ aWord ];
980                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
981                 if wordinfo = nil then begin
982                         wordinfo := TWordInfo.Create;
983                         Objects[ aWord ] := wordinfo;
984                 end;
985
986                 if isImportant then begin
987                         wordinfo.ImportantWord := wordinfo.ImportantWord + countinfo.WordCount;
988                         wordinfo.ImportantText := wordinfo.ImportantText + 1;
989                 end else begin
990                         wordinfo.NormalWord := wordinfo.NormalWord + countinfo.WordCount;
991                         wordinfo.NormalText := wordinfo.NormalText + 1;
992                 end;
993         end;
994
995 end;
996
997 //==============================
998 // Forget
999 //==============================
1000 procedure       TGikoBayesian.Forget(
1001         wordCount               : TWordCount;
1002         isImportant     : Boolean );
1003 var
1004         aWord                   : string;
1005         wordinfo        : TWordInfo;
1006         countinfo       : TWordCountInfo;
1007         i                       : Integer;
1008 begin
1009
1010         for i := 0 to wordCount.Count - 1 do begin
1011                 aWord := wordCount[ i ];
1012                 wordinfo := Objects[ aWord ];
1013                 if wordinfo = nil then
1014                         Continue;
1015
1016                 countinfo := TWordCountInfo( wordCount.Objects[ i ] );
1017                 if isImportant then begin
1018                         if wordInfo.ImportantText > 0 then begin
1019                                 wordinfo.ImportantText := wordinfo.ImportantText - 1;
1020                                 wordinfo.ImportantWord := wordinfo.ImportantWord - countinfo.WordCount;
1021                         end;
1022                 end else begin
1023                         if wordinfo.NormalText > 0 then begin
1024                                 wordinfo.NormalText := wordinfo.NormalText - 1;
1025                                 wordinfo.NormalWord := wordinfo.NormalWord - countinfo.WordCount;
1026                         end;
1027                 end;
1028         end;
1029
1030 end;
1031
1032 end.