1 #include "license.hunspell"
\r
2 #include "license.myspell"
\r
4 #ifndef MOZILLA_CLIENT
\r
10 #include <stdlib.h>
\r
16 #include "hashmgr.hxx"
\r
17 #include "csutil.hxx"
\r
18 #include "atypes.hxx"
\r
20 #ifdef MOZILLA_CLIENT
\r
21 #ifdef __SUNPRO_CC // for SunONE Studio compiler
\r
22 using namespace std;
\r
26 using namespace std;
\r
30 // build a hash table from a munched word list
\r
32 HashMgr::HashMgr(const char * tpath, const char * apath)
\r
36 flag_mode = FLAG_CHAR;
\r
37 complexprefixes = 0;
\r
40 ignorechars_utf16 = NULL;
\r
41 ignorechars_utf16_len = 0;
\r
46 load_config(apath);
\r
47 int ec = load_tables(tpath);
\r
49 /* error condition - what should we do here */
\r
50 HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n",ec);
\r
62 // now pass through hash table freeing up everything
\r
63 // go through column by column of the table
\r
64 for (int i=0; i < tablesize; i++) {
\r
65 struct hentry * pt = &tableptr[i];
\r
66 struct hentry * nt = NULL;
\r
68 if (pt->astr && !aliasf) free(pt->astr);
\r
69 if (pt->word) free(pt->word);
\r
70 #ifdef HUNSPELL_EXPERIMENTAL
\r
71 if (pt->description && !aliasm) free(pt->description);
\r
77 if (pt->astr && !aliasf) free(pt->astr);
\r
78 if (pt->word) free(pt->word);
\r
79 #ifdef HUNSPELL_EXPERIMENTAL
\r
80 if (pt->description && !aliasm) free(pt->description);
\r
91 for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
\r
100 for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
\r
105 if (ignorechars) free(ignorechars);
\r
106 if (ignorechars_utf16) free(ignorechars_utf16);
\r
109 // lookup a root word in the hashtable
\r
111 struct hentry * HashMgr::lookup(const char *word) const
\r
113 struct hentry * dp;
\r
115 dp = &tableptr[hash(word)];
\r
116 if (dp->word == NULL) return NULL;
\r
117 for ( ; dp != NULL; dp = dp->next) {
\r
118 if (strcmp(word,dp->word) == 0) return dp;
\r
124 // add a word to the hash table (private)
\r
126 int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, const char * desc)
\r
128 char * st = mystrdup(word);
\r
129 if (wl && !st) return 1;
\r
130 if (ignorechars != NULL) {
\r
132 remove_ignored_chars_utf(st, ignorechars_utf16, ignorechars_utf16_len);
\r
134 remove_ignored_chars(st, ignorechars);
\r
137 if (complexprefixes) {
\r
138 if (utf8) reverseword_utf(st); else reverseword(st);
\r
141 struct hentry * dp = &tableptr[i];
\r
142 if (dp->word == NULL) {
\r
143 dp->wlen = (short) wl;
\r
144 dp->alen = (short) al;
\r
148 dp->next_homonym = NULL;
\r
149 #ifdef HUNSPELL_EXPERIMENTAL
\r
151 dp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
\r
153 dp->description = mystrdup(desc);
\r
154 if (desc && !dp->description) return 1;
\r
155 if (dp->description && complexprefixes) {
\r
156 if (utf8) reverseword_utf(dp->description); else reverseword(dp->description);
\r
161 struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry));
\r
163 hp->wlen = (short) wl;
\r
164 hp->alen = (short) al;
\r
168 hp->next_homonym = NULL;
\r
169 #ifdef HUNSPELL_EXPERIMENTAL
\r
171 hp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
\r
173 hp->description = mystrdup(desc);
\r
174 if (desc && !hp->description) return 1;
\r
175 if (dp->description && complexprefixes) {
\r
176 if (utf8) reverseword_utf(hp->description); else reverseword(hp->description);
\r
180 while (dp->next != NULL) {
\r
181 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp;
\r
184 if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp;
\r
190 // add a custom dic. word to the hash table (public)
\r
191 int HashMgr::put_word(const char * word, int wl, char * aff)
\r
193 unsigned short * flags;
\r
196 al = decode_flags(&flags, aff);
\r
197 flag_qsort(flags, 0, al);
\r
201 add_word(word, wl, flags, al, NULL);
\r
205 int HashMgr::put_word_pattern(const char * word, int wl, const char * pattern)
\r
207 unsigned short * flags;
\r
208 struct hentry * dp = lookup(pattern);
\r
209 if (!dp || !dp->astr) return 1;
\r
210 flags = (unsigned short *) malloc (dp->alen * sizeof(short));
\r
211 memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
\r
212 add_word(word, wl, flags, dp->alen, NULL);
\r
216 // walk the hash table entry by entry - null at end
\r
217 struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
\r
220 if ((col < 0) || (hp == NULL)) {
\r
225 if (hp && hp->next != NULL) {
\r
229 hp = (col < tablesize) ? &tableptr[col] : NULL;
\r
230 // search for next non-blank column entry
\r
231 while (hp && (hp->word == NULL)) {
\r
233 hp = (col < tablesize) ? &tableptr[col] : NULL;
\r
235 if (col < tablesize) return hp;
\r
242 // load a munched word list and build a hash table on the fly
\r
243 int HashMgr::load_tables(const char * tpath)
\r
248 unsigned short * flags;
\r
250 // raw dictionary - munched file
\r
251 FILE * rawdict = fopen(tpath, "r");
\r
252 if (rawdict == NULL) return 1;
\r
254 // first read the first line of file to get hash table size
\r
256 if (! fgets(ts, MAXDELEN-1,rawdict)) return 2;
\r
259 /* remove byte order mark */
\r
260 if (strncmp(ts,"",3) == 0) {
\r
261 memmove(ts, ts+3, strlen(ts+3)+1);
\r
262 HUNSPELL_WARNING(stderr, "warning: dic file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
\r
265 if ((*ts < '1') || (*ts > '9')) HUNSPELL_WARNING(stderr, "error - missing word count in dictionary file\n");
\r
266 tablesize = atoi(ts);
\r
267 if (!tablesize) return 4;
\r
268 tablesize = tablesize + 5 + USERWORD;
\r
269 if ((tablesize %2) == 0) tablesize++;
\r
271 // allocate the hash table
\r
272 tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
\r
273 if (! tableptr) return 3;
\r
274 for (int i=0; i<tablesize; i++) tableptr[i].word = NULL;
\r
276 // loop through all words on munched list and add to hash
\r
277 // table and create word and affix strings
\r
279 while (fgets(ts,MAXDELEN-1,rawdict)) {
\r
281 // split each line into word and morphological description
\r
282 dp = strchr(ts,'\t');
\r
291 // split each line into word and affix char strings
\r
292 // "\/" signs slash in words (not affix separator)
\r
293 // "/" at beginning of the line is word character (not affix separator)
\r
294 ap = strchr(ts,'/');
\r
299 } else if (*(ap - 1) != '\\') break;
\r
300 // replace "\/" with "/"
\r
301 for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
\r
302 ap = strchr(ap,'/');
\r
308 int index = atoi(ap + 1);
\r
309 al = get_aliasf(index, &flags);
\r
311 HUNSPELL_WARNING(stderr, "error - bad flag vector alias: %s\n", ts);
\r
315 al = decode_flags(&flags, ap + 1);
\r
316 flag_qsort(flags, 0, al);
\r
326 // add the word and its index
\r
327 if (add_word(ts,wl,flags,al,dp)) return 5;
\r
336 // the hash function is a simple load and rotate
\r
337 // algorithm borrowed
\r
339 int HashMgr::hash(const char * word) const
\r
342 for (int i=0; i < 4 && *word != 0; i++)
\r
343 hv = (hv << 8) | (*word++);
\r
344 while (*word != 0) {
\r
345 ROTATE(hv,ROTATE_LEN);
\r
348 return (unsigned long) hv % tablesize;
\r
351 int HashMgr::decode_flags(unsigned short ** result, char * flags) {
\r
353 switch (flag_mode) {
\r
354 case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
\r
355 len = strlen(flags);
\r
356 if (len%2 == 1) HUNSPELL_WARNING(stderr, "error: length of FLAG_LONG flagvector is odd: %s\n", flags);
\r
358 *result = (unsigned short *) malloc(len * sizeof(short));
\r
359 for (int i = 0; i < len; i++) {
\r
360 (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1];
\r
364 case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
\r
366 char * src = flags;
\r
367 unsigned short * dest;
\r
369 for (p = flags; *p; p++) {
\r
370 if (*p == ',') len++;
\r
372 *result = (unsigned short *) malloc(len * sizeof(short));
\r
374 for (p = flags; *p; p++) {
\r
376 *dest = (unsigned short) atoi(src);
\r
377 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
\r
382 *dest = (unsigned short) atoi(src);
\r
383 if (*dest == 0) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
\r
386 case FLAG_UNI: { // UTF-8 characters
\r
387 w_char w[MAXDELEN/2];
\r
388 len = u8_u16(w, MAXDELEN/2, flags);
\r
389 *result = (unsigned short *) malloc(len * sizeof(short));
\r
390 memcpy(*result, w, len * sizeof(short));
\r
393 default: { // Ispell's one-character flags (erfg -> e r f g)
\r
394 unsigned short * dest;
\r
395 len = strlen(flags);
\r
396 *result = (unsigned short *) malloc(len * sizeof(short));
\r
398 for (unsigned char * p = (unsigned char *) flags; *p; p++) {
\r
399 *dest = (unsigned short) *p;
\r
407 unsigned short HashMgr::decode_flag(const char * f) {
\r
408 unsigned short s = 0;
\r
409 switch (flag_mode) {
\r
411 s = ((unsigned short) f[0] << 8) + (unsigned short) f[1];
\r
414 s = (unsigned short) atoi(f);
\r
417 u8_u16((w_char *) &s, 1, f);
\r
420 s = (unsigned short) *((unsigned char *)f);
\r
422 if (!s) HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
\r
426 char * HashMgr::encode_flag(unsigned short f) {
\r
427 unsigned char ch[10];
\r
428 if (f==0) return mystrdup("(NULL)");
\r
429 if (flag_mode == FLAG_LONG) {
\r
430 ch[0] = (unsigned char) (f >> 8);
\r
431 ch[1] = (unsigned char) (f - ((f >> 8) << 8));
\r
433 } else if (flag_mode == FLAG_NUM) {
\r
434 sprintf((char *) ch, "%d", f);
\r
435 } else if (flag_mode == FLAG_UNI) {
\r
436 u16_u8((char *) &ch, 10, (w_char *) &f, 1);
\r
438 ch[0] = (unsigned char) (f);
\r
441 return mystrdup((char *) ch);
\r
444 // read in aff file and set flag mode
\r
445 int HashMgr::load_config(const char * affpath)
\r
450 char line[MAXDELEN+1];
\r
452 // open the affix file
\r
454 afflst = fopen(affpath,"r");
\r
456 HUNSPELL_WARNING(stderr, "Error - could not open affix description file %s\n",affpath);
\r
460 // read in each line ignoring any that do not
\r
461 // start with a known line type indicator
\r
463 while (fgets(line,MAXDELEN,afflst)) {
\r
466 /* remove byte order mark */
\r
469 if (strncmp(line,"",3) == 0) memmove(line, line+3, strlen(line+3)+1);
\r
472 /* parse in the flag mode (FLAG parameter) */
\r
473 if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
\r
474 if (flag_mode != FLAG_CHAR) {
\r
475 HUNSPELL_WARNING(stderr, "error: duplicate FLAG parameter\n");
\r
477 if (strstr(line, "long")) flag_mode = FLAG_LONG;
\r
478 if (strstr(line, "num")) flag_mode = FLAG_NUM;
\r
479 if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
\r
480 if (flag_mode == FLAG_CHAR) {
\r
481 HUNSPELL_WARNING(stderr, "error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line);
\r
484 if ((strncmp(line,"SET",3) == 0) && isspace(line[3]) && strstr(line, "UTF-8")) utf8 = 1;
\r
486 /* parse in the ignored characters (for example, Arabic optional diacritic characters) */
\r
487 if (strncmp(line,"IGNORE",6) == 0) {
\r
488 if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8)) {
\r
494 if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
\r
495 if (parse_aliasf(line, afflst)) {
\r
501 #ifdef HUNSPELL_EXPERIMENTAL
\r
502 if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
\r
503 if (parse_aliasm(line, afflst)) {
\r
509 if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
\r
510 if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
\r
516 /* parse in the AF (alias for flag vector) table */
\r
517 int HashMgr::parse_aliasf(char * line, FILE * af)
\r
519 if (numaliasf != 0) {
\r
520 HUNSPELL_WARNING(stderr, "error: duplicate AF (alias for flag vector) tables used\n");
\r
527 piece = mystrsep(&tp, 0);
\r
529 if (*piece != '\0') {
\r
531 case 0: { np++; break; }
\r
533 numaliasf = atoi(piece);
\r
534 if (numaliasf < 1) {
\r
538 HUNSPELL_WARNING(stderr, "incorrect number of entries in AF table\n");
\r
542 aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
\r
543 aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short));
\r
544 if (!aliasf || !aliasflen) {
\r
546 if (aliasf) free(aliasf);
\r
547 if (aliasflen) free(aliasflen);
\r
560 piece = mystrsep(&tp, 0);
\r
568 HUNSPELL_WARNING(stderr, "error: missing AF table information\n");
\r
572 /* now parse the numaliasf lines to read in the remainder of the table */
\r
574 for (int j=0; j < numaliasf; j++) {
\r
575 if (!fgets(nl,MAXDELEN,af)) return 1;
\r
581 piece = mystrsep(&tp, 0);
\r
583 if (*piece != '\0') {
\r
586 if (strncmp(piece,"AF",2) != 0) {
\r
592 HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n");
\r
599 aliasflen[j] = (unsigned short) decode_flags(&(aliasf[j]), piece);
\r
600 flag_qsort(aliasf[j], 0, aliasflen[j]);
\r
608 piece = mystrsep(&tp, 0);
\r
616 HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n");
\r
623 int HashMgr::is_aliasf() {
\r
624 return (aliasf != NULL);
\r
627 int HashMgr::get_aliasf(int index, unsigned short ** fvec) {
\r
628 if ((index > 0) && (index <= numaliasf)) {
\r
629 *fvec = aliasf[index - 1];
\r
630 return aliasflen[index - 1];
\r
632 HUNSPELL_WARNING(stderr, "error: bad flag alias index: %d\n", index);
\r
637 #ifdef HUNSPELL_EXPERIMENTAL
\r
638 /* parse morph alias definitions */
\r
639 int HashMgr::parse_aliasm(char * line, FILE * af)
\r
641 if (numaliasm != 0) {
\r
642 HUNSPELL_WARNING(stderr, "error: duplicate AM (aliases for morphological descriptions) tables used\n");
\r
649 piece = mystrsep(&tp, 0);
\r
651 if (*piece != '\0') {
\r
653 case 0: { np++; break; }
\r
655 numaliasm = atoi(piece);
\r
656 if (numaliasm < 1) {
\r
657 HUNSPELL_WARNING(stderr, "incorrect number of entries in AM table\n");
\r
661 aliasm = (char **) malloc(numaliasm * sizeof(char *));
\r
674 piece = mystrsep(&tp, 0);
\r
680 HUNSPELL_WARNING(stderr, "error: missing AM alias information\n");
\r
684 /* now parse the numaliasm lines to read in the remainder of the table */
\r
686 for (int j=0; j < numaliasm; j++) {
\r
687 if (!fgets(nl,MAXDELEN,af)) return 1;
\r
692 piece = mystrsep(&tp, 0);
\r
694 if (*piece != '\0') {
\r
697 if (strncmp(piece,"AM",2) != 0) {
\r
698 HUNSPELL_WARNING(stderr, "error: AM table is corrupt\n");
\r
708 if (complexprefixes) {
\r
709 if (utf8) reverseword_utf(piece);
\r
710 else reverseword(piece);
\r
712 aliasm[j] = mystrdup(piece);
\r
719 piece = mystrsep(&tp, 0);
\r
725 HUNSPELL_WARNING(stderr, "error: map table is corrupt\n");
\r
732 int HashMgr::is_aliasm() {
\r
733 return (aliasm != NULL);
\r
736 char * HashMgr::get_aliasm(int index) {
\r
737 if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1];
\r
738 HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
\r