6 #include "mythes.hxx"
\r
10 MyThes::MyThes(const char* idxpath, const char * datpath)
\r
17 if (thInitialize(idxpath, datpath) != 1) {
\r
18 fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath);
\r
20 if (encoding) free((void*)encoding);
\r
21 if (list) free((void*)list);
\r
22 if (offst) free((void*)offst);
\r
23 // did not initialize properly - throw exception?
\r
30 if (thCleanup() != 1) {
\r
31 /* did not cleanup properly - throw exception? */
\r
33 if (encoding) free((void*)encoding);
\r
40 int MyThes::thInitialize(const char* idxpath, const char* datpath)
\r
43 // open the index file
\r
44 FILE * pifile = fopen(idxpath,"r");
\r
50 // parse in encoding and index size
\r
52 wrd = (char *)calloc(1, MAX_WD_LEN);
\r
53 int len = readLine(pifile,wrd,MAX_WD_LEN);
\r
54 encoding = mystrdup(wrd);
\r
55 len = readLine(pifile,wrd,MAX_WD_LEN);
\r
56 int idxsz = atoi(wrd);
\r
59 // now allocate list, offst for the given size
\r
60 list = (char**) calloc(idxsz,sizeof(char*));
\r
61 offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int));
\r
63 if ( (!(list)) || (!(offst)) ) {
\r
64 fprintf(stderr,"Error - bad memory allocation\n");
\r
69 // now parse the remaining lines of the index
\r
70 len = readLine(pifile,wrd,MAX_WD_LEN);
\r
73 int np = mystr_indexOfChar(wrd,'|');
\r
77 list[nw] = (char *)calloc(1,(np+1));
\r
78 memcpy((list[nw]),wrd,np);
\r
79 offst[nw] = atoi(wrd+np+1);
\r
83 len = readLine(pifile,wrd,MAX_WD_LEN);
\r
90 /* next open the data file */
\r
91 pdfile = fopen(datpath,"r");
\r
101 int MyThes::thCleanup()
\r
103 /* first close the data file */
\r
109 /* now free up all the allocated strings on the list */
\r
110 for (int i=0; i < nw; i++)
\r
118 if (list) free((void*)list);
\r
119 if (offst) free((void*)offst);
\r
127 // lookup text in index and count of meanings and a list of meaning entries
\r
128 // with each entry having a synonym count and pointer to an
\r
129 // array of char * (i.e. the synonyms)
\r
131 // note: calling routine should call CleanUpAfterLookup with the original
\r
132 // meaning pointer and count to properly deallocate memory
\r
134 int MyThes::Lookup(const char * pText, int len, mentry** pme)
\r
139 // handle the case of missing file or file related errors
\r
140 if (! pdfile) return 0;
\r
144 /* copy search word and make sure null terminated */
\r
145 char * wrd = (char *) calloc(1,(len+1));
\r
146 memcpy(wrd,pText,len);
\r
148 /* find it in the list */
\r
149 int idx = binsearch(wrd,list,nw);
\r
151 if (idx < 0) return 0;
\r
153 // now seek to the offset
\r
154 offset = (long) offst[idx];
\r
155 int rc = fseek(pdfile,offset,SEEK_SET);
\r
160 // grab the count of the number of meanings
\r
161 // and allocate a list of meaning entries
\r
163 buf = (char *) malloc( MAX_LN_LEN );
\r
164 if (!buf) return 0;
\r
165 readLine(pdfile, buf, (MAX_LN_LEN-1));
\r
166 int np = mystr_indexOfChar(buf,'|');
\r
171 int nmeanings = atoi(buf+np+1);
\r
172 *pme = (mentry*) malloc( nmeanings * sizeof(mentry) );
\r
178 // now read in each meaning and parse it to get defn, count and synonym lists
\r
179 mentry* pm = *(pme);
\r
180 char dfn[MAX_WD_LEN];
\r
182 for (int j = 0; j < nmeanings; j++) {
\r
183 readLine(pdfile, buf, (MAX_LN_LEN-1));
\r
189 // store away the part of speech for later use
\r
192 np = mystr_indexOfChar(p,'|');
\r
198 pos = mystrdup("");
\r
201 // count the number of fields in the remaining line
\r
204 np = mystr_indexOfChar(d,'|');
\r
205 while ( np >= 0 ) {
\r
208 np = mystr_indexOfChar(d,'|');
\r
211 pm->psyns = (char **) malloc(nf*sizeof(char*));
\r
213 // fill in the synonym list
\r
215 for (int j = 0; j < nf; j++) {
\r
216 np = mystr_indexOfChar(d,'|');
\r
219 pm->psyns[j] = mystrdup(d);
\r
222 pm->psyns[j] = mystrdup(d);
\r
226 // add pos to first synonym to create the definition
\r
227 int k = strlen(pos);
\r
228 int m = strlen(pm->psyns[0]);
\r
229 if ((k+m) < (MAX_WD_LEN - 1)) {
\r
230 strncpy(dfn,pos,k);
\r
232 strncpy((dfn+k+1),(pm->psyns[0]),m+1);
\r
233 pm->defn = mystrdup(dfn);
\r
235 pm->defn = mystrdup(pm->psyns[0]);
\r
248 void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings)
\r
251 if (nmeanings == 0) return;
\r
252 if ((*pme) == NULL) return;
\r
254 mentry * pm = *pme;
\r
256 for (int i = 0; i < nmeanings; i++) {
\r
257 int count = pm->count;
\r
258 for (int j = 0; j < count; j++) {
\r
259 if (pm->psyns[j]) free(pm->psyns[j]);
\r
260 pm->psyns[j] = NULL;
\r
262 if (pm->psyns) free(pm->psyns);
\r
264 if (pm->defn) free(pm->defn);
\r
276 // read a line of text from a text file stripping
\r
277 // off the line terminator and replacing it with
\r
278 // a null string terminator.
\r
279 // returns: -1 on error or the number of characters in
\r
280 // the returned string
\r
282 // A maximum of nc characters will be returned
\r
284 int MyThes::readLine(FILE * pf, char * buf, int nc)
\r
287 if (fgets(buf,nc,pf)) {
\r
289 return strlen(buf);
\r
296 // performs a binary search on null terminated character
\r
299 // returns: -1 on not found
\r
300 // index of wrd in the list[]
\r
302 int MyThes::binsearch(char * sw, char* list[], int nlst)
\r
304 int lp, up, mp, j, indx;
\r
308 if (strcmp(sw,list[lp]) < 0) return -1;
\r
309 if (strcmp(sw,list[up]) > 0) return -1;
\r
310 while (indx < 0 ) {
\r
311 mp = (int)((lp+up) >> 1);
\r
312 j = strcmp(sw,list[mp]);
\r
315 } else if (j < 0 ) {
\r
320 if (lp > up) return -1;
\r
325 char * MyThes::get_th_encoding()
\r
327 if (encoding) return encoding;
\r
332 // string duplication routine
\r
333 char * MyThes::mystrdup(const char * p)
\r
335 int sl = strlen(p) + 1;
\r
336 char * d = (char *)malloc(sl);
\r
344 // remove cross-platform text line end characters
\r
345 void MyThes::mychomp(char * s)
\r
348 if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
\r
349 if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
\r
353 // return index of char in string
\r
354 int MyThes::mystr_indexOfChar(const char * d, int c)
\r
356 char * p = strchr((char *)d,c);
\r
357 if (p) return (int)(p-d);
\r