1 #include "license.hunspell"
\r
2 #include "license.myspell"
\r
4 #ifndef MOZILLA_CLIENT
\r
10 #include <stdlib.h>
\r
16 #include "affentry.hxx"
\r
17 #include "csutil.hxx"
\r
19 #ifndef MOZILLA_CLIENT
\r
21 using namespace std;
\r
26 PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
\r
28 // register affix manager
\r
31 // set up its initial values
\r
33 aflag = dp->aflag; // flag
\r
34 strip = dp->strip; // string to strip
\r
35 appnd = dp->appnd; // string to append
\r
36 stripl = dp->stripl; // length of strip string
\r
37 appndl = dp->appndl; // length of append string
\r
38 numconds = dp->numconds; // number of conditions to match
\r
39 opts = dp->opts; // cross product flag
\r
40 // then copy over all of the conditions
\r
41 memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0]));
\r
45 #ifdef HUNSPELL_EXPERIMENTAL
\r
46 morphcode = dp->morphcode;
\r
48 contclass = dp->contclass;
\r
49 contclasslen = dp->contclasslen;
\r
53 PfxEntry::~PfxEntry()
\r
56 if (appnd) free(appnd);
\r
57 if (strip) free(strip);
\r
61 if (opts & aeUTF8) {
\r
62 for (int i = 0; i < 8; i++) {
\r
63 if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]);
\r
66 #ifdef HUNSPELL_EXPERIMENTAL
\r
67 if (morphcode && !(opts & aeALIASM)) free(morphcode);
\r
69 if (contclass && !(opts & aeALIASF)) free(contclass);
\r
72 // add prefix to this word assuming conditions hold
\r
73 char * PfxEntry::add(const char * word, int len)
\r
75 char tword[MAXWORDUTF8LEN + 4];
\r
77 if ((len > stripl) && (len >= numconds) && test_condition(word) &&
\r
78 (!stripl || (strncmp(word, strip, stripl) == 0)) &&
\r
79 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
\r
80 /* we have a match so add prefix */
\r
83 strcpy(tword,appnd);
\r
86 strcpy(pp, (word + stripl));
\r
87 return mystrdup(tword);
\r
93 inline int PfxEntry::test_condition(const char * st)
\r
96 unsigned char * cp = (unsigned char *)st;
\r
97 if (!(opts & aeUTF8)) { // 256-character codepage
\r
98 for (cond = 0; cond < numconds; cond++) {
\r
99 if ((conds.base[*cp++] & (1 << cond)) == 0) return 0;
\r
101 } else { // UTF-8 encoding
\r
103 for (cond = 0; cond < numconds; cond++) {
\r
104 // a simple 7-bit ASCII character in UTF-8
\r
105 if ((*cp >> 7) == 0) {
\r
106 // also check limit (end of word)
\r
107 if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0;
\r
108 // UTF-8 multibyte character
\r
110 // not dot wildcard in rule
\r
111 if (!conds.utf8.all[cond]) {
\r
112 if (conds.utf8.neg[cond]) {
\r
113 u8_u16((w_char *) &wc, 1, (char *) cp);
\r
114 if (conds.utf8.wchars[cond] &&
\r
115 flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
\r
116 wc, (short) conds.utf8.wlen[cond])) return 0;
\r
118 if (!conds.utf8.wchars[cond]) return 0;
\r
119 u8_u16((w_char *) &wc, 1, (char *) cp);
\r
120 if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
\r
121 wc, (short)conds.utf8.wlen[cond])) return 0;
\r
124 // jump to next UTF-8 character
\r
125 for(cp++; (*cp & 0xc0) == 0x80; cp++);
\r
133 // check if this prefix entry matches
\r
134 struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)
\r
136 int tmpl; // length of tmpword
\r
137 struct hentry * he; // hash entry of root word or NULL
\r
138 char tmpword[MAXWORDUTF8LEN + 4];
\r
140 // on entry prefix is 0 length or already matches the beginning of the word.
\r
141 // So if the remaining root word has positive length
\r
142 // and if there are enough chars in root word and added back strip chars
\r
143 // to meet the number of characters conditions, then test it
\r
145 tmpl = len - appndl;
\r
147 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
\r
149 // generate new root word by removing prefix and adding
\r
150 // back any characters that would have been stripped
\r
152 if (stripl) strcpy (tmpword, strip);
\r
153 strcpy ((tmpword + stripl), (word + appndl));
\r
155 // now make sure all of the conditions on characters
\r
156 // are met. Please see the appendix at the end of
\r
157 // this file for more info on exactly what is being
\r
160 // if all conditions are met then check if resulting
\r
161 // root word in the dictionary
\r
163 if (test_condition(tmpword)) {
\r
165 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
\r
167 if (TESTAFF(he->astr, aflag, he->alen) &&
\r
168 // forbid single prefixes with pseudoroot flag
\r
169 ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
\r
171 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
\r
172 (contclass && TESTAFF(contclass, needflag, contclasslen))))
\r
174 he = he->next_homonym; // check homonyms
\r
178 // prefix matched but no root word was found
\r
179 // if aeXPRODUCT is allowed, try again but now
\r
180 // cross checked combined with a suffix
\r
182 //if ((opts & aeXPRODUCT) && in_compound) {
\r
183 if ((opts & aeXPRODUCT)) {
\r
184 he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL,
\r
185 0, NULL, FLAG_NULL, needflag, in_compound);
\r
193 // check if this prefix entry matches
\r
194 struct hentry * PfxEntry::check_twosfx(const char * word, int len,
\r
195 char in_compound, const FLAG needflag)
\r
197 int tmpl; // length of tmpword
\r
198 struct hentry * he; // hash entry of root word or NULL
\r
199 char tmpword[MAXWORDUTF8LEN + 4];
\r
201 // on entry prefix is 0 length or already matches the beginning of the word.
\r
202 // So if the remaining root word has positive length
\r
203 // and if there are enough chars in root word and added back strip chars
\r
204 // to meet the number of characters conditions, then test it
\r
206 tmpl = len - appndl;
\r
208 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
\r
210 // generate new root word by removing prefix and adding
\r
211 // back any characters that would have been stripped
\r
213 if (stripl) strcpy (tmpword, strip);
\r
214 strcpy ((tmpword + stripl), (word + appndl));
\r
216 // now make sure all of the conditions on characters
\r
217 // are met. Please see the appendix at the end of
\r
218 // this file for more info on exactly what is being
\r
221 // if all conditions are met then check if resulting
\r
222 // root word in the dictionary
\r
224 if (test_condition(tmpword)) {
\r
227 // prefix matched but no root word was found
\r
228 // if aeXPRODUCT is allowed, try again but now
\r
229 // cross checked combined with a suffix
\r
231 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
\r
232 he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, needflag);
\r
240 #ifdef HUNSPELL_EXPERIMENTAL
\r
241 // check if this prefix entry matches
\r
242 char * PfxEntry::check_twosfx_morph(const char * word, int len,
\r
243 char in_compound, const FLAG needflag)
\r
245 int tmpl; // length of tmpword
\r
246 char tmpword[MAXWORDUTF8LEN + 4];
\r
248 // on entry prefix is 0 length or already matches the beginning of the word.
\r
249 // So if the remaining root word has positive length
\r
250 // and if there are enough chars in root word and added back strip chars
\r
251 // to meet the number of characters conditions, then test it
\r
253 tmpl = len - appndl;
\r
255 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
\r
257 // generate new root word by removing prefix and adding
\r
258 // back any characters that would have been stripped
\r
260 if (stripl) strcpy (tmpword, strip);
\r
261 strcpy ((tmpword + stripl), (word + appndl));
\r
263 // now make sure all of the conditions on characters
\r
264 // are met. Please see the appendix at the end of
\r
265 // this file for more info on exactly what is being
\r
268 // if all conditions are met then check if resulting
\r
269 // root word in the dictionary
\r
271 if (test_condition(tmpword)) {
\r
274 // prefix matched but no root word was found
\r
275 // if aeXPRODUCT is allowed, try again but now
\r
276 // cross checked combined with a suffix
\r
278 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
\r
279 return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
\r
280 aeXPRODUCT, (AffEntry *)this, needflag);
\r
287 // check if this prefix entry matches
\r
288 char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
\r
290 int tmpl; // length of tmpword
\r
291 struct hentry * he; // hash entry of root word or NULL
\r
292 char tmpword[MAXWORDUTF8LEN + 4];
\r
293 char result[MAXLNLEN];
\r
298 // on entry prefix is 0 length or already matches the beginning of the word.
\r
299 // So if the remaining root word has positive length
\r
300 // and if there are enough chars in root word and added back strip chars
\r
301 // to meet the number of characters conditions, then test it
\r
303 tmpl = len - appndl;
\r
305 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
\r
307 // generate new root word by removing prefix and adding
\r
308 // back any characters that would have been stripped
\r
310 if (stripl) strcpy (tmpword, strip);
\r
311 strcpy ((tmpword + stripl), (word + appndl));
\r
313 // now make sure all of the conditions on characters
\r
314 // are met. Please see the appendix at the end of
\r
315 // this file for more info on exactly what is being
\r
318 // if all conditions are met then check if resulting
\r
319 // root word in the dictionary
\r
321 if (test_condition(tmpword)) {
\r
323 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
\r
325 if (TESTAFF(he->astr, aflag, he->alen) &&
\r
326 // forbid single prefixes with pseudoroot flag
\r
327 ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
\r
329 ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
\r
330 (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
\r
331 if (morphcode) strcat(result, morphcode); else strcat(result,getKey());
\r
332 if (he->description) {
\r
333 if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word);
\r
334 strcat(result,he->description);
\r
336 strcat(result, "\n");
\r
338 he = he->next_homonym;
\r
342 // prefix matched but no root word was found
\r
343 // if aeXPRODUCT is allowed, try again but now
\r
344 // cross checked combined with a suffix
\r
346 if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
\r
347 st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this,
\r
348 FLAG_NULL, needflag);
\r
350 strcat(result, st);
\r
357 if (*result) return mystrdup(result);
\r
360 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
\r
362 SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
\r
364 // register affix manager
\r
367 // set up its initial values
\r
368 aflag = dp->aflag; // char flag
\r
369 strip = dp->strip; // string to strip
\r
370 appnd = dp->appnd; // string to append
\r
371 stripl = dp->stripl; // length of strip string
\r
372 appndl = dp->appndl; // length of append string
\r
373 numconds = dp->numconds; // number of conditions to match
\r
374 opts = dp->opts; // cross product flag
\r
376 // then copy over all of the conditions
\r
377 memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0]));
\r
379 rappnd = myrevstrdup(appnd);
\r
381 #ifdef HUNSPELL_EXPERIMENTAL
\r
382 morphcode = dp->morphcode;
\r
384 contclass = dp->contclass;
\r
385 contclasslen = dp->contclasslen;
\r
389 SfxEntry::~SfxEntry()
\r
392 if (appnd) free(appnd);
\r
393 if (rappnd) free(rappnd);
\r
394 if (strip) free(strip);
\r
398 if (opts & aeUTF8) {
\r
399 for (int i = 0; i < 8; i++) {
\r
400 if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]);
\r
403 #ifdef HUNSPELL_EXPERIMENTAL
\r
404 if (morphcode && !(opts & aeALIASM)) free(morphcode);
\r
406 if (contclass && !(opts & aeALIASF)) free(contclass);
\r
409 // add suffix to this word assuming conditions hold
\r
410 char * SfxEntry::add(const char * word, int len)
\r
412 char tword[MAXWORDUTF8LEN + 4];
\r
414 /* make sure all conditions match */
\r
415 if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) &&
\r
416 (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
\r
417 ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
\r
418 /* we have a match so add suffix */
\r
419 strcpy(tword,word);
\r
421 strcpy(tword + len - stripl, appnd);
\r
423 *(tword + len - stripl) = '\0';
\r
425 return mystrdup(tword);
\r
431 inline int SfxEntry::test_condition(const char * st, const char * beg)
\r
434 unsigned char * cp = (unsigned char *) st;
\r
435 if (!(opts & aeUTF8)) { // 256-character codepage
\r
436 // Dömölki affix algorithm
\r
437 for (cond = numconds; --cond >= 0; ) {
\r
438 if ((conds.base[*--cp] & (1 << cond)) == 0) return 0;
\r
440 } else { // UTF-8 encoding
\r
442 for (cond = numconds; --cond >= 0; ) {
\r
443 // go to next character position and check limit
\r
444 if ((char *) --cp < beg) return 0;
\r
445 // a simple 7-bit ASCII character in UTF-8
\r
446 if ((*cp >> 7) == 0) {
\r
447 if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0;
\r
448 // UTF-8 multibyte character
\r
450 // go to first character of UTF-8 multibyte character
\r
451 for (; (*cp & 0xc0) == 0x80; cp--);
\r
452 // not dot wildcard in rule
\r
453 if (!conds.utf8.all[cond]) {
\r
454 if (conds.utf8.neg[cond]) {
\r
455 u8_u16((w_char *) &wc, 1, (char *) cp);
\r
456 if (conds.utf8.wchars[cond] &&
\r
457 flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
\r
458 wc, (short) conds.utf8.wlen[cond])) return 0;
\r
460 if (!conds.utf8.wchars[cond]) return 0;
\r
461 u8_u16((w_char *) &wc, 1, (char *) cp);
\r
462 if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
\r
463 wc, (short)conds.utf8.wlen[cond])) return 0;
\r
474 // see if this suffix is present in the word
\r
475 struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,
\r
476 AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,
\r
477 const FLAG badflag)
\r
479 int tmpl; // length of tmpword
\r
480 struct hentry * he; // hash entry pointer
\r
481 unsigned char * cp;
\r
482 char tmpword[MAXWORDUTF8LEN + 4];
\r
483 PfxEntry* ep = (PfxEntry *) ppfx;
\r
485 // if this suffix is being cross checked with a prefix
\r
486 // but it does not support cross products skip it
\r
488 if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
\r
491 // upon entry suffix is 0 length or already matches the end of the word.
\r
492 // So if the remaining root word has positive length
\r
493 // and if there are enough chars in root word and added back strip chars
\r
494 // to meet the number of characters conditions, then test it
\r
496 tmpl = len - appndl;
\r
497 // the second condition is not enough for UTF-8 strings
\r
498 // it is checked in test_condition()
\r
500 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
\r
502 // generate new root word by removing suffix and adding
\r
503 // back any characters that would have been stripped or
\r
504 // null terminating the shorter string
\r
506 strcpy (tmpword, word);
\r
507 cp = (unsigned char *)(tmpword + tmpl);
\r
509 strcpy ((char *)cp, strip);
\r
511 cp = (unsigned char *)(tmpword + tmpl);
\r
514 // now make sure all of the conditions on characters
\r
515 // are met. Please see the appendix at the end of
\r
516 // this file for more info on exactly what is being tested
\r
518 // if all conditions are met then check if resulting
\r
519 // root word in the dictionary
\r
521 if (test_condition((char *) cp, (char *) tmpword)) {
\r
523 #ifdef SZOSZABLYA_POSSIBLE_ROOTS
\r
524 fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
\r
526 if ((he = pmyMgr->lookup(tmpword)) != NULL) {
\r
528 // check conditional suffix (enabled by prefix)
\r
529 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
\r
530 TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
\r
531 (((optflags & aeXPRODUCT) == 0) ||
\r
532 TESTAFF(he->astr, ep->getFlag(), he->alen) ||
\r
533 // enabled by prefix
\r
534 ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
\r
536 // handle cont. class
\r
538 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
\r
540 // check only in compound homonyms (bad flags)
\r
541 (!badflag || !TESTAFF(he->astr, badflag, he->alen)
\r
543 // handle required flag
\r
545 (TESTAFF(he->astr, needflag, he->alen) ||
\r
546 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
\r
549 he = he->next_homonym; // check homonyms
\r
552 // obsolete stemming code (used only by the
\r
553 // experimental SuffixMgr:suggest_pos_stems)
\r
554 // store resulting root in wlst
\r
555 } else if (wlst && (*ns < maxSug)) {
\r
557 for (int k=0; k < *ns; k++)
\r
558 if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
\r
560 wlst[*ns] = mystrdup(tmpword);
\r
561 if (wlst[*ns] == NULL) {
\r
562 for (int j=0; j<*ns; j++) free(wlst[j]);
\r
574 // see if two-level suffix is present in the word
\r
575 struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
\r
576 AffEntry* ppfx, const FLAG needflag)
\r
578 int tmpl; // length of tmpword
\r
579 struct hentry * he; // hash entry pointer
\r
580 unsigned char * cp;
\r
581 char tmpword[MAXWORDUTF8LEN + 4];
\r
582 PfxEntry* ep = (PfxEntry *) ppfx;
\r
585 // if this suffix is being cross checked with a prefix
\r
586 // but it does not support cross products skip it
\r
588 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
\r
591 // upon entry suffix is 0 length or already matches the end of the word.
\r
592 // So if the remaining root word has positive length
\r
593 // and if there are enough chars in root word and added back strip chars
\r
594 // to meet the number of characters conditions, then test it
\r
596 tmpl = len - appndl;
\r
598 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
\r
600 // generate new root word by removing suffix and adding
\r
601 // back any characters that would have been stripped or
\r
602 // null terminating the shorter string
\r
604 strcpy (tmpword, word);
\r
605 cp = (unsigned char *)(tmpword + tmpl);
\r
607 strcpy ((char *)cp, strip);
\r
609 cp = (unsigned char *)(tmpword + tmpl);
\r
612 // now make sure all of the conditions on characters
\r
613 // are met. Please see the appendix at the end of
\r
614 // this file for more info on exactly what is being
\r
617 // if all conditions are met then recall suffix_check
\r
619 if (test_condition((char *) cp, (char *) tmpword)) {
\r
621 // handle conditional suffix
\r
622 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
\r
623 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
\r
625 he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
\r
627 he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
\r
635 #ifdef HUNSPELL_EXPERIMENTAL
\r
636 // see if two-level suffix is present in the word
\r
637 char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
\r
638 AffEntry* ppfx, const FLAG needflag)
\r
640 int tmpl; // length of tmpword
\r
641 unsigned char * cp;
\r
642 char tmpword[MAXWORDUTF8LEN + 4];
\r
643 PfxEntry* ep = (PfxEntry *) ppfx;
\r
646 char result[MAXLNLEN];
\r
650 // if this suffix is being cross checked with a prefix
\r
651 // but it does not support cross products skip it
\r
653 if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
\r
656 // upon entry suffix is 0 length or already matches the end of the word.
\r
657 // So if the remaining root word has positive length
\r
658 // and if there are enough chars in root word and added back strip chars
\r
659 // to meet the number of characters conditions, then test it
\r
661 tmpl = len - appndl;
\r
663 if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
\r
665 // generate new root word by removing suffix and adding
\r
666 // back any characters that would have been stripped or
\r
667 // null terminating the shorter string
\r
669 strcpy (tmpword, word);
\r
670 cp = (unsigned char *)(tmpword + tmpl);
\r
672 strcpy ((char *)cp, strip);
\r
674 cp = (unsigned char *)(tmpword + tmpl);
\r
677 // now make sure all of the conditions on characters
\r
678 // are met. Please see the appendix at the end of
\r
679 // this file for more info on exactly what is being
\r
682 // if all conditions are met then recall suffix_check
\r
684 if (test_condition((char *) cp, (char *) tmpword)) {
\r
686 // handle conditional suffix
\r
687 if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
\r
688 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
\r
690 if (((PfxEntry *) ppfx)->getMorph()) {
\r
691 strcat(result, ((PfxEntry *) ppfx)->getMorph());
\r
698 st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
\r
700 strcat(result, st);
\r
706 st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
\r
708 strcat(result, st);
\r
713 if (*result) return mystrdup(result);
\r
718 #endif // END OF HUNSPELL_EXPERIMENTAL CODE
\r
720 // get next homonym with same affix
\r
721 struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx,
\r
722 const FLAG cclass, const FLAG needflag)
\r
724 PfxEntry* ep = (PfxEntry *) ppfx;
\r
726 while (he->next_homonym) {
\r
727 he = he->next_homonym;
\r
728 if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
\r
729 ((optflags & aeXPRODUCT) == 0 ||
\r
730 TESTAFF(he->astr, ep->getFlag(), he->alen) ||
\r
731 // handle conditional suffix
\r
732 ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
\r
734 // handle cont. class
\r
736 ((contclass) && TESTAFF(contclass, cclass, contclasslen))
\r
738 // handle required flag
\r
740 (TESTAFF(he->astr, needflag, he->alen) ||
\r
741 ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
\r
751 Appendix: Understanding Affix Code
\r
754 An affix is either a prefix or a suffix attached to root words to make
\r
757 Basically a Prefix or a Suffix is a set of AffEntry objects
\r
758 which store information about the prefix or suffix along
\r
759 with supporting routines to check if a word has a particular
\r
760 prefix or suffix or a combination.
\r
762 The structure affentry is defined as follows:
\r
766 unsigned short aflag; // ID used to represent the affix
\r
767 char * strip; // string to strip before adding affix
\r
768 char * appnd; // the affix string to add
\r
769 unsigned char stripl; // length of the strip string
\r
770 unsigned char appndl; // length of the affix string
\r
771 char numconds; // the number of conditions that must be met
\r
772 char opts; // flag: aeXPRODUCT- combine both prefix and suffix
\r
773 char conds[SETSIZE]; // array which encodes the conditions to be met
\r
777 Here is a suffix borrowed from the en_US.aff file. This file
\r
778 is whitespace delimited.
\r
782 SFX D y ied [^aeiou]y
\r
784 SFX D 0 ed [aeiou]y
\r
786 This information can be interpreted as follows:
\r
788 The first line has 4 fields
\r
792 1 SFX - indicates this is a suffix
\r
793 2 D - is the name of the character flag which represents this suffix
\r
794 3 Y - indicates it can be combined with prefixes (cross product)
\r
795 4 4 - indicates that a sequence of 4 affentry structures is needed to
\r
796 properly store the affix information
\r
798 The remaining lines describe the unique information for the 4 SfxEntry
\r
799 objects that make up this affix. Each line can be interpreted
\r
800 as follows: (note fields 1 and 2 are as a check against line 1 info)
\r
804 1 SFX - indicates this is a suffix
\r
805 2 D - is the name of the character flag for this affix
\r
806 3 y - the string of chars to strip off before adding affix
\r
807 (a 0 here indicates the NULL string)
\r
808 4 ied - the string of affix characters to add
\r
809 5 [^aeiou]y - the conditions which must be met before the affix
\r
812 Field 5 is interesting. Since this is a suffix, field 5 tells us that
\r
813 there are 2 conditions that must be met. The first condition is that
\r
814 the next to the last character in the word must *NOT* be any of the
\r
815 following "a", "e", "i", "o" or "u". The second condition is that
\r
816 the last character of the word must be "y".
\r
818 So how can we encode this information concisely and be able to
\r
819 test for both conditions in a fast manner? The answer is found
\r
820 by studying the wonderful ispell code of Geoff Kuenning, et al.
\r
821 (now available under a normal BSD license).
\r
823 If we set up a conds array of 256 bytes indexed (0 to 255) and access it
\r
824 using a character (cast to an unsigned char) of a string, we have 8 bits
\r
825 of information we can store about that character. Specifically we
\r
826 could use each bit to say if that character is allowed in any of the
\r
827 last (or first for prefixes) 8 characters of the word.
\r
829 Basically, each character at one end of the word (up to the number
\r
830 of conditions) is used to index into the conds array and the resulting
\r
831 value found there says whether that character is valid for a
\r
832 specific character position in the word.
\r
834 For prefixes, it does this by setting bit 0 if that char is valid
\r
835 in the first position, bit 1 if valid in the second position, and so on.
\r
837 If a bit is not set, then that char is not valid for that position in the
\r
840 If working with suffixes bit 0 is used for the character closest
\r
841 to the front, bit 1 for the next character towards the end, ...,
\r
842 with bit numconds-1 representing the last char at the end of the string.
\r
844 Note: since entries in the conds[] are 8 bits, only 8 conditions
\r
845 (read that only 8 character positions) can be examined at one
\r
846 end of a word (the beginning for prefixes and the end for suffixes).
\r
848 So to make this clearer, let's encode the conds array values for the
\r
849 first two affentries for the suffix D described earlier.
\r
852 For the first affentry:
\r
853 numconds = 1 (only examine the last character)
\r
855 conds['e'] = (1 << 0) (the word must end in an E)
\r
856 all others are all 0
\r
858 For the second affentry:
\r
859 numconds = 2 (only examine the last two characters)
\r
861 conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)
\r
862 where X is all characters *but* a, e, i, o, or u
\r
865 conds['y'] = (1 << 1) (the last char must be a y)
\r
866 all other bits for all other entries in the conds array are zero
\r