\r
package cx.fbn.nevernote.threads;\r
\r
-import java.io.ByteArrayInputStream;\r
-import java.io.ByteArrayOutputStream;\r
import java.io.File;\r
import java.io.FileInputStream;\r
import java.io.FileNotFoundException;\r
import java.io.IOException;\r
import java.io.InputStream;\r
import java.util.List;\r
+import java.util.TreeSet;\r
import java.util.concurrent.LinkedBlockingQueue;\r
+import java.util.concurrent.locks.LockSupport;\r
\r
import org.apache.commons.lang.StringEscapeUtils;\r
import org.apache.tika.exception.TikaException;\r
import org.apache.tika.parser.pdf.PDFParser;\r
import org.apache.tika.parser.rtf.RTFParser;\r
import org.apache.tika.sax.BodyContentHandler;\r
-import org.w3c.tidy.Tidy;\r
import org.xml.sax.ContentHandler;\r
import org.xml.sax.SAXException;\r
\r
public boolean keepRunning;\r
private final QDomDocument doc;\r
private static String regex = Global.getWordRegex();\r
+ public String specialIndexCharacters = "";\r
+ public boolean indexNoteBody = true;\r
+ public boolean indexNoteTitle = true;\r
+ public boolean indexImageRecognition = true;\r
private final DatabaseConnection conn;\r
private volatile LinkedBlockingQueue<String> workQueue;\r
private static int MAX_QUEUED_WAITING = 1000;\r
public boolean interrupt;\r
public boolean idle;\r
+ public boolean indexAttachmentsLocally = true;\r
public volatile IndexSignal signal;\r
+ private final TreeSet<String> foundWords;\r
+ int uncommittedCount = 0;\r
\r
\r
- public IndexRunner(String logname, String u, String uid, String pswd, String cpswd) {\r
+ public IndexRunner(String logname, String u, String i, String r, String uid, String pswd, String cpswd) {\r
+ foundWords = new TreeSet<String>();\r
logger = new ApplicationLogger(logname);\r
- conn = new DatabaseConnection(logger, u, uid, pswd, cpswd);\r
+ conn = new DatabaseConnection(logger, u, i, r, uid, pswd, cpswd, 500);\r
indexType = SCAN;\r
guid = null;\r
keepRunning = true;\r
while (keepRunning) {\r
idle=true;\r
try {\r
+ conn.commitTransaction();\r
+ uncommittedCount = 0;\r
String work = workQueue.take();\r
idle=false;\r
if (work.startsWith("SCAN")) {\r
reindexNote();\r
}\r
} catch (InterruptedException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Thread interrupted exception: " +e.getMessage());\r
}\r
}\r
+ logger.log(logger.EXTREME, "Shutting down database");\r
conn.dbShutdown();\r
+ logger.log(logger.EXTREME, "Database shut down. Exiting thread");\r
}\r
\r
// Reindex a note\r
+ // Rebuilds the "CONTENT" word index entries for the note identified by the\r
+ // guid field: loads the note, optionally takes its body (with en-crypt\r
+ // sections removed) and its title, unescapes HTML entities, strips tags,\r
+ // splits the text with the word regex and hands every non-blank piece to\r
+ // addToIndex. The existing "CONTENT" entries for the note are expunged first.\r
+ // The word loop stops early once keepRunning becomes false and calls\r
+ // processInterrupt whenever the interrupt flag is set.\r
public void indexNoteContent() {\r
+ foundWords.clear();\r
\r
-// if (wordMap.size() > 0)\r
-// wordMap.clear();\r
logger.log(logger.EXTREME, "Entering indexRunner.indexNoteContent()");\r
\r
logger.log(logger.EXTREME, "Getting note content");\r
Note n = conn.getNoteTable().getNote(guid,true,false,true,true, true);\r
- String data = n.getContent();\r
+ String data;\r
+ if (indexNoteBody) {\r
+ // NOTE(review): the value read here is overwritten by the very next\r
+ // statement, so this first assignment has no effect on the result.\r
+ data = n.getContent();\r
+ data = conn.getNoteTable().getNoteContentNoUTFConversion(n.getGuid());\r
\r
- logger.log(logger.EXTREME, "Removing any encrypted data");\r
- data = removeEnCrypt(data);\r
- logger.log(logger.EXTREME, "Removing xml markups");\r
- Tidy tidy = new Tidy();\r
- tidy.getStderr().close(); // the listener will capture messages\r
- tidy.setXmlTags(true);\r
- byte html[] = data.getBytes();\r
- ByteArrayInputStream is = new ByteArrayInputStream(html);\r
- ByteArrayOutputStream os = new ByteArrayOutputStream();\r
- tidy.parse(is, os);\r
- String text = StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", "")) +" "+\r
- n.getTitle();\r
+ logger.log(logger.EXTREME, "Removing any encrypted data");\r
+ data = removeEnCrypt(data.toString());\r
+ logger.log(logger.EXTREME, "Removing xml markups");\r
+ } else\r
+ data = "";\r
+ // Entities are unescaped before the tags are stripped.\r
+ // NOTE(review): an escaped bracket such as &lt; therefore becomes a real '<'\r
+ // before removeTags runs and is treated as markup - confirm this is intended.\r
+ String text;\r
+ if (indexNoteTitle)\r
+ text = removeTags(StringEscapeUtils.unescapeHtml(data) +" "+ n.getTitle());\r
+ else\r
+ text = removeTags(StringEscapeUtils.unescapeHtml(data));\r
\r
logger.log(logger.EXTREME, "Splitting words");\r
String[] result = text.toString().split(regex);\r
+ // Commit any pending work, then open a new transaction before the index\r
+ // rows for this note are replaced.\r
+ conn.commitTransaction();\r
+ conn.beginTransaction();\r
logger.log(logger.EXTREME, "Deleting existing words for note from index");\r
conn.getWordsTable().expungeFromWordIndex(guid, "CONTENT");\r
\r
logger.log(logger.EXTREME, "Number of words found: " +result.length);\r
for (int j=0; j<result.length && keepRunning; j++) {\r
- logger.log(logger.EXTREME, "Result word: " +result[j]);\r
- addToIndex(guid, result[j], "CONTENT");\r
+ if (interrupt) {\r
+ processInterrupt();\r
+ }\r
+ if (!result[j].trim().equals("")) {\r
+ logger.log(logger.EXTREME, "Result word: " +result[j].trim());\r
+ // The trimmed form is only used for the blank check and the log line;\r
+ // the untrimmed piece is what gets passed on.\r
+ addToIndex(guid, result[j], "CONTENT");\r
+ }\r
}\r
// If we were interrupted, we will reindex this note next time\r
+ // NOTE(review): the loop above is stopped by the instance field keepRunning,\r
+ // while this check reads Global.keepRunning - confirm the two always agree,\r
+ // otherwise a partially indexed note could be marked as done.\r
if (Global.keepRunning) {\r
logger.log(logger.EXTREME, "Resetting note guid needed");\r
conn.getNoteTable().setIndexNeeded(guid, false);\r
- }\r
+ } \r
+ conn.commitTransaction();\r
+ uncommittedCount = 0;\r
logger.log(logger.EXTREME, "Leaving indexRunner.indexNoteContent()");\r
}\r
+ \r
+ \r
+ // Strips markup from text by scanning from the last character to the first.\r
+ // A '>' opens a region that is discarded and the next '<' found (moving\r
+ // towards the start) closes it; both delimiters are dropped as well.\r
+ // Consequences of the backward scan: a '>' with no '<' before it discards\r
+ // everything from the start of the string up to that '>', and a lone '<' is\r
+ // simply removed.\r
+ //\r
+ // Surviving characters are written into a char array from the back, which\r
+ // keeps the work linear; deleting from a StringBuffer one character at a time\r
+ // shifts the retained tail on every deletion.\r
+ //\r
+ // text: the string to clean; must not be null.\r
+ // returns: text with every tag region and every '<' / '>' removed.\r
+ private String removeTags(String text) {\r
+     final int length = text.length();\r
+     final char[] kept = new char[length];\r
+     int writePos = length; // kept characters occupy kept[writePos..length)\r
+     boolean inTag = false;\r
+     for (int i = length - 1; i >= 0; i--) {\r
+         final char c = text.charAt(i);\r
+         if (c == '>') {\r
+             inTag = true;\r
+         } else if (c == '<') {\r
+             // The opening bracket ends the tag and is itself never kept.\r
+             inTag = false;\r
+             continue;\r
+         }\r
+         if (!inTag) {\r
+             kept[--writePos] = c;\r
+         }\r
+     }\r
+     return new String(kept, writePos, length - writePos);\r
+ }\r
\r
\r
public synchronized boolean addWork(String request) {\r
\r
if (guid == null)\r
return;\r
- \r
+ foundWords.clear();\r
Resource r = conn.getNoteTable().noteResourceTable.getNoteResourceRecognition(guid);\r
- if (r == null || r.getRecognition() == null || r.getRecognition().getBody() == null || r.getRecognition().getBody().length == 0) \r
+ if (!indexImageRecognition || \r
+ r == null || r.getRecognition() == null || \r
+ r.getRecognition().getBody() == null || \r
+ r.getRecognition().getBody().length == 0) \r
resourceBinary = new QByteArray(" ");\r
else\r
resourceBinary = new QByteArray(r.getRecognition().getBody());\r
\r
+ conn.commitTransaction();\r
+ conn.beginTransaction();\r
conn.getWordsTable().expungeFromWordIndex(r.getNoteGuid(), "RESOURCE");\r
// This is due to an old bug & can be removed at some point in the future 11/23/2010\r
conn.getWordsTable().expungeFromWordIndex(guid, "RESOURCE"); \r
+ conn.commitTransaction();\r
+ uncommittedCount = 0;\r
+ conn.beginTransaction();\r
\r
doc.setContent(resourceBinary);\r
QDomElement docElem = doc.documentElement();\r
// look for text tags\r
QDomNodeList anchors = docElem.elementsByTagName("t");\r
for (int i=0; i<anchors.length() && keepRunning; i++) {\r
+ if (interrupt) {\r
+ if (interrupt) {\r
+ processInterrupt();\r
+ }\r
+ }\r
QDomElement enmedia = anchors.at(i).toElement();\r
String weight = new String(enmedia.attribute("w"));\r
String text = new String(enmedia.text()).toLowerCase();\r
if (!text.equals("")) {\r
conn.getWordsTable().addWordToNoteIndex(r.getNoteGuid(), text, "RESOURCE", new Integer(weight));\r
+ uncommittedCount++;\r
+ if (uncommittedCount > 100) {\r
+ conn.commitTransaction();\r
+ uncommittedCount=0;\r
+ }\r
}\r
}\r
\r
- if (Global.keepRunning) {\r
+ if (Global.keepRunning && indexAttachmentsLocally) {\r
+ conn.commitTransaction();\r
+ uncommittedCount = 0;\r
+ conn.beginTransaction();\r
indexResourceContent(guid);\r
}\r
- \r
+ \r
if (Global.keepRunning)\r
conn.getNoteTable().noteResourceTable.setIndexNeeded(guid,false);\r
+ conn.commitTransaction();\r
+ uncommittedCount = 0;\r
}\r
\r
private void indexResourceContent(String guid) {\r
\r
private void indexResourceRTF(Resource r) {\r
\r
- QTemporaryFile f = writeResource(r.getData());\r
+ Data d = r.getData();\r
+ for (int i=0; i<20 && d.getSize() == 0; i++)\r
+ d = r.getData();\r
+ if (d.getSize()== 0)\r
+ return;\r
+\r
+ QTemporaryFile f = writeResource(d);\r
if (!keepRunning) {\r
return;\r
}\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
} catch (Exception e) {\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage());\r
}\r
}\r
\r
\r
private void indexResourceODF(Resource r) {\r
\r
- QTemporaryFile f = writeResource(r.getData());\r
+ Data d = r.getData();\r
+ for (int i=0; i<20 && d.getSize() == 0; i++)\r
+ d = r.getData();\r
+ if (d.getSize()== 0)\r
+ return;\r
+ QTemporaryFile f = writeResource(d);\r
if (!keepRunning) {\r
return;\r
}\r
parser.parse(input, textHandler, metadata, context);\r
String[] result = textHandler.toString().split(regex);\r
for (int i=0; i<result.length && keepRunning; i++) {\r
+ if (interrupt) {\r
+ processInterrupt();\r
+ }\r
addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
}\r
input.close();\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
} catch (Exception e) {\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage());\r
}\r
}\r
\r
\r
private void indexResourceOffice(Resource r) {\r
\r
- QTemporaryFile f = writeResource(r.getData());\r
+ Data d = r.getData();\r
+ for (int i=0; i<20 && d.getSize() == 0; i++)\r
+ d = r.getData();\r
+ if (d.getSize()== 0)\r
+ return;\r
+ QTemporaryFile f = writeResource(d);\r
if (!keepRunning) {\r
return;\r
}\r
parser.parse(input, textHandler, metadata, context);\r
String[] result = textHandler.toString().split(regex);\r
for (int i=0; i<result.length && keepRunning; i++) {\r
+ if (interrupt) {\r
+ processInterrupt();\r
+ }\r
addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
}\r
input.close();\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
} catch (Exception e) {\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage());\r
}\r
}\r
\r
\r
private void indexResourcePDF(Resource r) {\r
\r
- QTemporaryFile f = writeResource(r.getData());\r
+ Data d = r.getData();\r
+ for (int i=0; i<20 && d.getSize() == 0; i++)\r
+ d = r.getData();\r
+ if (d.getSize()== 0)\r
+ return;\r
+ QTemporaryFile f = writeResource(d);\r
if (!keepRunning) {\r
return;\r
}\r
parser.parse(input, textHandler, metadata, context);\r
String[] result = textHandler.toString().split(regex);\r
for (int i=0; i<result.length && keepRunning; i++) {\r
+ if (interrupt) {\r
+ processInterrupt();\r
+ }\r
addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
}\r
input.close();\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
} catch (Exception e) {\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage());\r
}\r
}\r
\r
\r
private void indexResourceOOXML(Resource r) {\r
\r
- QTemporaryFile f = writeResource(r.getData());\r
+ Data d = r.getData();\r
+ for (int i=0; i<20 && d.getSize() == 0; i++)\r
+ d = r.getData();\r
+ if (d.getSize()== 0)\r
+ return;\r
+ QTemporaryFile f = writeResource(d);\r
if (!keepRunning) {\r
return;\r
}\r
parser.parse(input, textHandler, metadata, context);\r
String[] result = textHandler.toString().split(regex);\r
for (int i=0; i<result.length && keepRunning; i++) {\r
+ if (interrupt) {\r
+ processInterrupt();\r
+ }\r
addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
}\r
input.close();\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
} catch (Exception e) {\r
- e.printStackTrace();\r
- }\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage()); }\r
}\r
\r
\r
newFile.write(d.getBody());\r
newFile.close();\r
return newFile;\r
- }\r
+ } \r
\r
\r
private String removeEnCrypt(String content) {\r
int endPos;\r
boolean tagFound = true;\r
while (tagFound && keepRunning) {\r
+ if (interrupt) {\r
+ processInterrupt();\r
+ }\r
endPos = content.indexOf("</en-crypt>", index)+11;\r
if (endPos > -1 && index > -1) {\r
content = content.substring(0,index)+content.substring(endPos);\r
\r
\r
private void addToIndex(String guid, String word, String type) {\r
- if (word.length() > 0) {\r
+ if (foundWords.contains(word))\r
+ return;\r
+ StringBuffer buffer = new StringBuffer(word.toLowerCase());\r
+ for (int i=buffer.length()-1; i>=0; i--) {\r
+ if (!Character.isLetterOrDigit(buffer.charAt(i)) && specialIndexCharacters.indexOf(buffer.charAt(i)) == -1)\r
+ buffer.deleteCharAt(i);\r
+ else\r
+ break;\r
+ }\r
+ buffer = buffer.reverse();\r
+ for (int i=buffer.length()-1; i>=0; i--) {\r
+ if (!Character.isLetterOrDigit(buffer.charAt(i)))\r
+ buffer.deleteCharAt(i);\r
+ else\r
+ break;\r
+ }\r
+ buffer = buffer.reverse();\r
+ if (buffer.length() > 0) {\r
// We have a good word, now let's trim off junk at the beginning or end\r
- StringBuffer buffer = new StringBuffer(word.toLowerCase());\r
- for (int x = buffer.length()-1; x>=0; x--) {\r
- if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
- buffer = buffer.deleteCharAt(x);\r
- else\r
- x=-1;\r
- }\r
- // Things have been trimmed off the end, so reverse the string & repeat.\r
- buffer = buffer.reverse();\r
- for (int x = buffer.length()-1; x>=0 && keepRunning; x--) {\r
- if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
- buffer = buffer.deleteCharAt(x);\r
- else\r
- x=-1;\r
- }\r
- // Restore the string back to the proper order.\r
- buffer = buffer.reverse();\r
- \r
- if (buffer.length()>=Global.minimumWordCount) {\r
+ if (!foundWords.contains(buffer.toString())) {\r
+ foundWords.add(buffer.toString());\r
+ foundWords.add(word);\r
conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), type, 100);\r
+ uncommittedCount++;\r
+ if (uncommittedCount > 100) {\r
+ conn.commitTransaction();\r
+ uncommittedCount=0;\r
+ }\r
}\r
}\r
return;\r
signal.indexStarted.emit();\r
started = true;\r
}\r
- for (int i=0; i<notes.size() && !interrupt && keepRunning; i++) {\r
+ for (int i=0; i<notes.size() && keepRunning; i++) {\r
+ if (interrupt) {\r
+ processInterrupt();\r
+ }\r
guid = notes.get(i);\r
if (guid != null && keepRunning) {\r
indexNoteContent();\r
}\r
\r
List<String> unindexedResources = conn.getNoteTable().noteResourceTable.getUnindexed();\r
- if (notes.size() > 0 && !started) {\r
+ if (unindexedResources.size() > 0 && !started) {\r
signal.indexStarted.emit();\r
started = true;\r
}\r
- for (int i=0; i>unindexedResources.size()&& !interrupt && keepRunning; i++) {\r
+ for (int i=0; i<unindexedResources.size()&& keepRunning; i++) {\r
+ if (interrupt) {\r
+ processInterrupt();\r
+ }\r
guid = unindexedResources.get(i);\r
if (keepRunning) {\r
indexResource();\r
}\r
}\r
- if (started && keepRunning && !interrupt) \r
+ if (started && keepRunning) \r
signal.indexFinished.emit();\r
}\r
\r
conn.getNoteTable().noteResourceTable.reindexAll(); \r
}\r
\r
+ // Pauses the calling thread for up to len seconds.\r
+ // The pause length follows the len argument instead of a fixed one second;\r
+ // a zero or negative len returns immediately.\r
+ // LockSupport.parkNanos may return early (spurious wake-up, unpark or thread\r
+ // interruption), so this is a best-effort pause rather than a guaranteed\r
+ // minimum delay.\r
+ private void waitSeconds(int len) {\r
+     if (len <= 0) {\r
+         return;\r
+     }\r
+     // The long literal widens the product so large values cannot overflow int.\r
+     LockSupport.parkNanos(len * 1000000000L);\r
+ }\r
+ \r
+ // Handles a pending interrupt request: commits the current transaction,\r
+ // pauses via waitSeconds(1), resets the uncommitted-row counter, opens a new\r
+ // transaction and finally clears the interrupt flag.\r
+ // Presumably the pause gives whoever set the flag a window in which this\r
+ // thread holds no open transaction - TODO confirm against the callers.\r
+ private void processInterrupt() {\r
+ conn.commitTransaction();\r
+ waitSeconds(1);\r
+ uncommittedCount = 0;\r
+ conn.beginTransaction();\r
+ // NOTE(review): a request raised while this method is running is cleared\r
+ // here as well, without triggering a second pause.\r
+ interrupt = false;\r
+ }\r
+ \r
}\r