2 Jazzy - a Java library for Spell Checking
\r
3 Copyright (C) 2001 Mindaugas Idzelis
\r
4 Full text of license can be found in LICENSE.txt
\r
6 This library is free software; you can redistribute it and/or
\r
7 modify it under the terms of the GNU Lesser General Public
\r
8 License as published by the Free Software Foundation; either
\r
9 version 2.1 of the License, or (at your option) any later version.
\r
11 This library is distributed in the hope that it will be useful,
\r
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
\r
14 Lesser General Public License for more details.
\r
16 You should have received a copy of the GNU Lesser General Public
\r
17 License along with this library; if not, write to the Free Software
\r
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
\r
20 /* Created by bgalbs on Jan 30, 2003 at 11:38:39 PM */
\r
21 package com.swabunga.spell.engine;
\r
23 import java.io.BufferedOutputStream;
\r
24 import java.io.BufferedReader;
\r
25 import java.io.BufferedWriter;
\r
26 import java.io.File;
\r
27 import java.io.FileInputStream;
\r
28 import java.io.FileNotFoundException;
\r
29 import java.io.FileOutputStream;
\r
30 import java.io.FileReader;
\r
31 import java.io.FileWriter;
\r
32 import java.io.IOException;
\r
33 import java.io.InputStream;
\r
34 import java.util.ArrayList;
\r
35 import java.util.Collections;
\r
36 import java.util.HashMap;
\r
37 import java.util.List;
\r
38 import java.util.Map;
\r
39 import java.util.StringTokenizer;
\r
40 import java.util.Vector;
\r
43 * An implementation of <code>SpellDictionary</code> that doesn't cache any words in memory. Avoids the huge
\r
44 * footprint of <code>SpellDictionaryHashMap</code> at the cost of relatively minor latency. A future version
\r
45 * of this class that implements some caching strategies might be a good idea in the future, if there's any
\r
48 * This class makes use of the "classic" Java IO library (java.io). However, it could probably benefit from
\r
49 * the new IO APIs (java.nio) and it is anticipated that a future version of this class, probably called
\r
50 * <code>SpellDictionaryDiskNIO</code> will appear at some point.
\r
52 * @author Ben Galbraith (ben@galbraiths.org)
\r
56 public class SpellDictionaryDisk extends SpellDictionaryASpell {
\r
57 private final static String DIRECTORY_WORDS = "words";
\r
58 private final static String DIRECTORY_DB = "db";
\r
59 private final static String FILE_CONTENTS = "contents";
\r
60 private final static String FILE_DB = "words.db";
\r
61 private final static String FILE_INDEX = "words.idx";
\r
63 /* maximum number of words an index entry can represent */
\r
64 private final static int INDEX_SIZE_MAX = 200;
\r
66 private final File base;
\r
67 private final File words;
\r
68 private final File db;
\r
69 @SuppressWarnings("unchecked")
\r
72 * The flag indicating if the initial preparation or loading of the on
\r
73 * disk dictionary is complete.
\r
75 protected boolean ready;
\r
77 /* used at time of creation of index to speed up determining the number of words per index entry */
\r
78 @SuppressWarnings("unchecked")
\r
79 private List indexCodeCache = null;
\r
82 * Construct a spell dictionary on disk.
\r
83 * The spell dictionary is created from words list(s) contained in file(s).
\r
84 * A words list file is a file with one word per line. Words list files are
\r
85 * located in a <code>base/words</code> dictionary where <code>base</code>
\r
86 * is the path to <code>words</code> dictionary. The on disk spell
\r
87 * dictionary is created in <code>base/db</code> dictionary and contains
\r
90 * <li><code>contents</code> list the words files used for spelling.</li>
\r
91 * <li><code>words.db</code> the content of words files organized as
\r
92 * a <em>database</em> of words.</li>
\r
93 * <li><code>words.idx</code> an index file to the <code>words.db</code>
\r
94 * file content.</li>
\r
96 * The <code>contents</code> file has a list of
\r
97 * <code>filename, size</code> indicating the name and length of each files
\r
98 * in the <code>base/words</code> dictionary. If one of theses files was
\r
99 * changed, added or deleted before the call to the constructor, the process
\r
100 * of producing new or updated <code>words.db</code> and
\r
101 * <code>words.idx</code> files is started again.
\r
103 * The spellchecking process is then worked upon the <code>words.db</code>
\r
104 * and <code>words.idx</code> files.
\r
107 * NOTE: Do *not* create two instances of this class pointing to the same <code>base</code> unless
\r
108 * you are sure that a new dictionary does not have to be created. In the future, some sort of
\r
109 * external locking mechanism may be created that handles this scenario gracefully.
\r
111 * @param base the base directory in which <code>SpellDictionaryDisk</code> can expect to find
\r
112 * its necessary files.
\r
113 * @param phonetic the phonetic file used by the spellchecker.
\r
114 * @param block if a new word db needs to be created, there can be a considerable delay before
\r
115 * the constructor returns. If block is true, this method will block while the db is created
\r
116 * and return when done. If block is false, this method will create a thread to create the new
\r
117 * dictionary and return immediately.
\r
118 * @throws java.io.FileNotFoundException indicates problems locating the
\r
119 * files on the system
\r
120 * @throws java.io.IOException indicates problems reading the files
\r
122 public SpellDictionaryDisk(File base, File phonetic, boolean block) throws FileNotFoundException, IOException {
\r
124 this.ready = false;
\r
127 this.words = new File(base, DIRECTORY_WORDS);
\r
128 this.db = new File(base, DIRECTORY_DB);
\r
130 if (!this.base.exists()) throw new FileNotFoundException("Couldn't find required path '" + this.base + "'");
\r
131 if (!this.words.exists()) throw new FileNotFoundException("Couldn't find required path '" + this.words + "'");
\r
132 if (!this.db.exists()) db.mkdirs();
\r
134 if (newDictionaryFiles()) {
\r
136 buildNewDictionaryDatabase();
\r
140 Thread t = new Thread() {
\r
142 public void run() {
\r
144 buildNewDictionaryDatabase();
\r
147 } catch (Exception e) {
\r
148 e.printStackTrace();
\r
161 * Builds the file words database file and the contents file for the on
\r
164 protected void buildNewDictionaryDatabase() throws FileNotFoundException, IOException {
\r
165 /* combine all dictionary files into one sorted file */
\r
166 File sortedFile = buildSortedFile();
\r
168 /* create the db for the sorted file */
\r
169 buildCodeDb(sortedFile);
\r
170 sortedFile.delete();
\r
172 /* build contents file */
\r
173 buildContentsFile();
\r
177 * Adds another word to the dictionary. <em>This method is not yet implemented
\r
178 * for this class</em>.
\r
179 * @param word The word to add.
\r
181 public void addWord(String word) {
\r
182 throw new UnsupportedOperationException("addWord not yet implemented (sorry)");
\r
186 * Returns a list of words that have the same phonetic code.
\r
187 * @param code The phonetic code common to the list of words
\r
188 * @return A list of words having the same phonetic code
\r
191 @SuppressWarnings("unchecked")
\r
192 public List getWords(String code) {
\r
193 Vector words = new Vector();
\r
195 int[] posLen = getStartPosAndLen(code);
\r
196 if (posLen != null) {
\r
198 InputStream input = new FileInputStream(new File(db, FILE_DB));
\r
199 input.skip(posLen[0]);
\r
200 byte[] bytes = new byte[posLen[1]];
\r
201 input.read(bytes, 0, posLen[1]);
\r
204 String data = new String(bytes);
\r
205 String[] lines = split(data, "\n");
\r
206 for (String line : lines) {
\r
207 String[] s = split(line, ",");
\r
208 if (s[0].equals(code)) words.addElement(s[1]);
\r
210 } catch (Exception e) {
\r
211 e.printStackTrace();
\r
219 * Indicates if the initial preparation or loading of the on disk dictionary
\r
221 * @return the indication that the dictionary initial setup is done.
\r
223 public boolean isReady() {
\r
227 @SuppressWarnings("unchecked")
\r
228 private boolean newDictionaryFiles() throws FileNotFoundException, IOException {
\r
229 /* load in contents file, which indicates the files and sizes of the last db build */
\r
230 List contents = new ArrayList();
\r
231 File c = new File(db, FILE_CONTENTS);
\r
233 BufferedReader reader = null;
\r
235 reader = new BufferedReader(new FileReader(c));
\r
237 while ((line = reader.readLine()) != null) {
\r
238 // format of file should be [filename],[size]
\r
239 String[] s = split(line, ",");
\r
240 contents.add(new FileSize(s[0], Integer.parseInt(s[1])));
\r
242 } catch (FileNotFoundException e) {
\r
244 } catch (IOException e) {
\r
247 if (reader != null) reader.close();
\r
251 /* compare this to the actual directory */
\r
252 boolean changed = false;
\r
253 File[] wordFiles = words.listFiles();
\r
254 if (contents.size() != wordFiles.length) {
\r
255 // if the size of the contents list and the number of word files are different, it
\r
256 // means we've definitely got to reindex
\r
259 // check and make sure that all the word files haven't changed on us
\r
260 for (File wordFile : wordFiles) {
\r
261 FileSize fs = new FileSize(wordFile.getName(), wordFile.length());
\r
262 if (!contents.contains(fs)) {
\r
272 @SuppressWarnings("unchecked")
\r
273 private File buildSortedFile() throws FileNotFoundException, IOException {
\r
274 List w = new ArrayList();
\r
277 * read every single word into the list. eeek. if this causes problems,
\r
278 * we may wish to explore disk-based sorting or more efficient memory-based storage
\r
280 File[] wordFiles = words.listFiles();
\r
281 for (File wordFile : wordFiles) {
\r
282 BufferedReader r = new BufferedReader(new FileReader(wordFile));
\r
284 while ((word = r.readLine()) != null) {
\r
285 if (!word.equals("")) {
\r
286 w.add(word.trim());
\r
292 Collections.sort(w);
\r
294 // FIXME - error handling for running out of disk space would be nice.
\r
295 File file = File.createTempFile("jazzy", "sorted");
\r
296 BufferedWriter writer = new BufferedWriter(new FileWriter(file));
\r
297 String prev = null;
\r
298 for (int i = 0; i < w.size(); i++) {
\r
299 String word = (String) w.get(i);
\r
300 if (prev == null || !prev.equals(word)) {
\r
301 writer.write(word);
\r
311 @SuppressWarnings("unchecked")
\r
312 private void buildCodeDb(File sortedWords) throws FileNotFoundException, IOException {
\r
313 List codeList = new ArrayList();
\r
315 BufferedReader reader = new BufferedReader(new FileReader(sortedWords));
\r
317 while ((word = reader.readLine()) != null) {
\r
318 codeList.add(new CodeWord(this.getCode(word), word));
\r
322 Collections.sort(codeList);
\r
324 List index = new ArrayList();
\r
326 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(new File(db, FILE_DB)));
\r
327 String currentCode = null;
\r
328 int currentPosition = 0;
\r
329 int currentLength = 0;
\r
330 for (int i = 0; i < codeList.size(); i++) {
\r
331 CodeWord cw = (CodeWord) codeList.get(i);
\r
332 String thisCode = cw.getCode();
\r
333 // if (thisCode.length() > 3) thisCode = thisCode.substring(0, 3);
\r
334 thisCode = getIndexCode(thisCode, codeList);
\r
335 String toWrite = cw.getCode() + "," + cw.getWord() + "\n";
\r
336 byte[] bytes = toWrite.getBytes();
\r
338 if (currentCode == null) currentCode = thisCode;
\r
339 if (!currentCode.equals(thisCode)) {
\r
340 index.add(new Object[]{currentCode, new int[]{currentPosition, currentLength}});
\r
341 currentPosition += currentLength;
\r
342 currentLength = bytes.length;
\r
343 currentCode = thisCode;
\r
345 currentLength += bytes.length;
\r
351 // Output the last iteration
\r
352 if (currentCode != null && currentPosition != 0 && currentLength != 0)
\r
353 index.add(new Object[]{currentCode, new int[]{currentPosition, currentLength}});
\r
355 BufferedWriter writer = new BufferedWriter(new FileWriter(new File(db, FILE_INDEX)));
\r
356 for (int i = 0; i < index.size(); i++) {
\r
357 Object[] o = (Object[]) index.get(i);
\r
358 writer.write(o[0].toString());
\r
360 writer.write(String.valueOf(((int[]) o[1])[0]));
\r
362 writer.write(String.valueOf(((int[]) o[1])[1]));
\r
368 private void buildContentsFile() throws IOException {
\r
369 File[] wordFiles = words.listFiles();
\r
370 if (wordFiles.length > 0) {
\r
371 BufferedWriter writer = new BufferedWriter(new FileWriter(new File(db, FILE_CONTENTS)));
\r
372 for (File wordFile : wordFiles) {
\r
373 writer.write(wordFile.getName());
\r
375 writer.write(String.valueOf(wordFile.length()));
\r
380 new File(db, FILE_CONTENTS).delete();
\r
385 * Loads the index file from disk. The index file accelerates words lookup
\r
386 * into the dictionary db file.
\r
388 @SuppressWarnings("unchecked")
\r
389 protected void loadIndex() throws IOException {
\r
390 index = new HashMap();
\r
391 File idx = new File(db, FILE_INDEX);
\r
392 BufferedReader reader = new BufferedReader(new FileReader(idx));
\r
394 while ((line = reader.readLine()) != null) {
\r
395 String[] fields = split(line, ",");
\r
396 index.put(fields[0], new int[]{Integer.parseInt(fields[1]), Integer.parseInt(fields[2])});
\r
401 private int[] getStartPosAndLen(String code) {
\r
402 while (code.length() > 0) {
\r
403 int[] posLen = (int[]) index.get(code);
\r
404 if (posLen == null) {
\r
405 code = code.substring(0, code.length() - 1);
\r
413 @SuppressWarnings("unchecked")
\r
414 private String getIndexCode(String code, List codes) {
\r
415 if (indexCodeCache == null) indexCodeCache = new ArrayList();
\r
417 if (code.length() <= 1) return code;
\r
419 for (int i = 0; i < indexCodeCache.size(); i++) {
\r
420 String c = (String) indexCodeCache.get(i);
\r
421 if (code.startsWith(c)) return c;
\r
424 int foundSize = -1;
\r
425 boolean cacheable = false;
\r
426 for (int z = 1; z < code.length(); z++) {
\r
427 String thisCode = code.substring(0, z);
\r
429 for (int i = 0; i < codes.size();) {
\r
431 i = Collections.binarySearch(codes, new CodeWord(thisCode, ""));
\r
435 CodeWord cw = (CodeWord) codes.get(i);
\r
436 if (cw.getCode().startsWith(thisCode)) {
\r
438 if (count > INDEX_SIZE_MAX) break;
\r
439 } else if (cw.getCode().compareTo(thisCode) > 0) break;
\r
442 if (count <= INDEX_SIZE_MAX) {
\r
449 String newCode = (foundSize == -1) ? code : code.substring(0, foundSize);
\r
450 if (cacheable) indexCodeCache.add(newCode);
\r
454 private static String[] split(String input, String delimiter) {
\r
455 StringTokenizer st = new StringTokenizer(input, delimiter);
\r
456 int count = st.countTokens();
\r
457 String[] out = new String[count];
\r
459 for (int i = 0; i < count; i++) {
\r
460 out[i] = st.nextToken();
\r
466 @SuppressWarnings("unchecked")
\r
467 private class CodeWord implements Comparable {
\r
468 private final String code;
\r
469 private final String word;
\r
471 public CodeWord(String code, String word) {
\r
476 public String getCode() {
\r
480 public String getWord() {
\r
485 public boolean equals(Object o) {
\r
486 if (this == o) return true;
\r
487 if (!(o instanceof CodeWord)) return false;
\r
489 final CodeWord codeWord = (CodeWord) o;
\r
491 if (!word.equals(codeWord.word)) return false;
\r
497 public int hashCode() {
\r
498 return word.hashCode();
\r
501 public int compareTo(Object o) {
\r
502 return code.compareTo(((CodeWord) o).getCode());
\r
506 private class FileSize {
\r
507 private final String filename;
\r
508 private final long size;
\r
510 public FileSize(String filename, long size) {
\r
511 this.filename = filename;
\r
515 @SuppressWarnings("unused")
\r
516 public String getFilename() {
\r
520 @SuppressWarnings("unused")
\r
521 public long getSize() {
\r
526 public boolean equals(Object o) {
\r
527 if (this == o) return true;
\r
528 if (!(o instanceof FileSize)) return false;
\r
530 final FileSize fileSize = (FileSize) o;
\r
532 if (size != fileSize.size) return false;
\r
533 if (!filename.equals(fileSize.filename)) return false;
\r
539 public int hashCode() {
\r
541 result = filename.hashCode();
\r
542 result = (int) (29 * result + size);
\r