\r
package cx.fbn.nevernote.threads;\r
\r
-import java.io.ByteArrayInputStream;\r
-import java.io.ByteArrayOutputStream;\r
import java.io.File;\r
import java.io.FileInputStream;\r
import java.io.FileNotFoundException;\r
import java.io.IOException;\r
import java.io.InputStream;\r
+import java.util.List;\r
+import java.util.TreeSet;\r
import java.util.concurrent.LinkedBlockingQueue;\r
\r
import org.apache.commons.lang.StringEscapeUtils;\r
import org.apache.tika.parser.pdf.PDFParser;\r
import org.apache.tika.parser.rtf.RTFParser;\r
import org.apache.tika.sax.BodyContentHandler;\r
-import org.w3c.tidy.Tidy;\r
import org.xml.sax.ContentHandler;\r
import org.xml.sax.SAXException;\r
\r
import com.trolltech.qt.xml.QDomNodeList;\r
\r
import cx.fbn.nevernote.Global;\r
+import cx.fbn.nevernote.signals.IndexSignal;\r
import cx.fbn.nevernote.signals.NoteResourceSignal;\r
import cx.fbn.nevernote.signals.NoteSignal;\r
import cx.fbn.nevernote.sql.DatabaseConnection;\r
public volatile NoteSignal noteSignal;\r
public volatile NoteResourceSignal resourceSignal;\r
private int indexType;\r
- public final int CONTENT=1; \r
- public final int RESOURCE=2;\r
+ // Request types dispatched by run(); the value is held in indexType while a request is handled.\r
+ public final int SCAN=1; \r
+ public final int REINDEXALL=2;\r
+ public final int REINDEXNOTE=3;\r
public boolean keepRunning;\r
private final QDomDocument doc;\r
private static String regex = Global.getWordRegex();\r
private final DatabaseConnection conn;\r
private volatile LinkedBlockingQueue<String> workQueue;\r
private static int MAX_QUEUED_WAITING = 1000;\r
-\r
- \r
+ // Polled by the scanUnindexed() loops to abandon a scan early; run() clears it when a SCAN starts.\r
+ // volatile because it is public and presumably set from outside the index thread (e.g. the GUI).\r
+ public volatile boolean interrupt;\r
+ // Set to true by run() just before it waits for work and to false once a request has been taken.\r
+ // volatile so that a reader on another thread sees the current value.\r
+ public volatile boolean idle;\r
+ // When false, the indexResourceContent() call for a resource is skipped.\r
+ public boolean indexAttachmentsLocally = true;\r
+ // Emits indexStarted/indexFinished around a scan; the object itself is created in run().\r
+ public volatile IndexSignal signal;\r
+ // Words already handled for the note or resource currently being indexed (see addToIndex()).\r
+ private final TreeSet<String> foundWords;\r
\r
\r
public IndexRunner(String logname, String u, String uid, String pswd, String cpswd) {\r
+ foundWords = new TreeSet<String>();\r
logger = new ApplicationLogger(logname);\r
conn = new DatabaseConnection(logger, u, uid, pswd, cpswd);\r
- noteSignal = new NoteSignal();\r
- resourceSignal = new NoteResourceSignal();\r
- indexType = CONTENT;\r
+ indexType = SCAN;\r
guid = null;\r
keepRunning = true;\r
doc = new QDomDocument();\r
+ /**\r
+  * Thread body. Takes one request string at a time from workQueue and handles it, looping\r
+  * until keepRunning is cleared; the database connection is shut down on the way out.\r
+  *\r
+  * Requests are recognised by prefix: "SCAN", "REINDEXALL", "REINDEXNOTE <guid>" and "STOP".\r
+  * indexType is reset to 0 after every handled request so that an unrecognised request\r
+  * cannot repeat the previous one.\r
+  */\r
@Override\r
public void run() {\r
thread().setPriority(Thread.MIN_PRIORITY);\r
+ // The signal objects are created here rather than in the constructor, presumably so that\r
+ // they belong to the index thread - TODO confirm.\r
+ noteSignal = new NoteSignal();\r
+ resourceSignal = new NoteResourceSignal();\r
+ signal = new IndexSignal();\r
logger.log(logger.EXTREME, "Starting index thread ");\r
while (keepRunning) {\r
+ idle=true;\r
try {\r
+ //waitSeconds(1);\r
String work = workQueue.take();\r
- if (work.startsWith("CONTENT")) {\r
- work = work.replace("CONTENT ", "");\r
- guid = work;\r
- indexType = CONTENT;\r
+ idle=false;\r
+ if (work.startsWith("SCAN")) {\r
+ guid=null;\r
+ interrupt = false;\r
+ indexType = SCAN;\r
+ }\r
+ if (work.startsWith("REINDEXALL")) {\r
+ guid = null;\r
+ indexType=REINDEXALL;\r
}\r
- if (work.startsWith("RESOURCE")) {\r
- work = work.replace("RESOURCE ", "");\r
+ if (work.startsWith("REINDEXNOTE")) {\r
+ work = work.replace("REINDEXNOTE ", "");\r
guid = work;\r
- indexType = RESOURCE;\r
+ indexType = REINDEXNOTE;\r
}\r
if (work.startsWith("STOP")) {\r
keepRunning = false;\r
- guid = work;\r
- }\r
- if (guid == null || guid.trim().equals("")) {\r
- setIndexType(0);\r
- resourceSignal.resourceIndexed.emit("null or empty guid");\r
+ guid = null;\r
}\r
logger.log(logger.EXTREME, "Type:" +indexType);\r
- if (indexType == CONTENT && keepRunning) {\r
- logger.log(logger.MEDIUM, "Indexing note: "+guid);\r
- indexNoteContent();\r
+ if (indexType == SCAN && keepRunning) {\r
+ logger.log(logger.MEDIUM, "Scanning for unindexed notes & resources");\r
+ scanUnindexed();\r
setIndexType(0);\r
}\r
- if (indexType == RESOURCE && keepRunning) {\r
- logger.log(logger.MEDIUM, "Indexing resource: "+guid);\r
- indexResource();\r
+ if (indexType == REINDEXALL && keepRunning) {\r
+ logger.log(logger.MEDIUM, "Marking all for reindex");\r
+ reindexAll();\r
setIndexType(0);\r
}\r
+ if (indexType == REINDEXNOTE && keepRunning) {\r
+ reindexNote();\r
+ // Reset like the SCAN and REINDEXALL branches do, so a later unrecognised request\r
+ // does not flag the same (stale) guid again.\r
+ setIndexType(0);\r
+ }\r
} catch (InterruptedException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ // Logged only; the loop goes back to waiting. Shutdown is requested with "STOP".\r
+ logger.log(logger.LOW, "Thread interrupted exception: " +e.getMessage());\r
}\r
}\r
+ logger.log(logger.EXTREME, "Shutting down database");\r
conn.dbShutdown();\r
+ logger.log(logger.EXTREME, "Database shut down. Exiting thread");\r
}\r
\r
// Reindex a note\r
+ /**\r
+  * Index the words of the note identified by the guid field.\r
+  *\r
+  * The note content is read without UTF conversion, passed through removeEnCrypt(), stripped\r
+  * of markup, entity-decoded and split on the global word regex together with the note title.\r
+  * Every non-blank token is handed to addToIndex() with type "CONTENT". The word loop stops\r
+  * early when keepRunning is cleared.\r
+  */\r
public void indexNoteContent() {\r
+ foundWords.clear();\r
+ \r
logger.log(logger.EXTREME, "Entering indexRunner.indexNoteContent()");\r
\r
logger.log(logger.EXTREME, "Getting note content");\r
Note n = conn.getNoteTable().getNote(guid,true,false,true,true, true);\r
String data = n.getContent();\r
+ data = conn.getNoteTable().getNoteContentNoUTFConversion(n.getGuid());\r
\r
logger.log(logger.EXTREME, "Removing any encrypted data");\r
- data = removeEnCrypt(data);\r
+ data = removeEnCrypt(data.toString());\r
logger.log(logger.EXTREME, "Removing xml markups");\r
- // These HTML characters need to be replaced by a space, or they'll cause words to jam together\r
-// data = data.toLowerCase().replace("<br>", " ").replace("<hr>", " ").replace("<p>", " ").replace("<href>", " ");\r
-// String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", ""));\r
- Tidy tidy = new Tidy();\r
- tidy.getStderr().close(); // the listener will capture messages\r
- tidy.setXmlTags(true);\r
- byte html[] = data.getBytes();\r
- ByteArrayInputStream is = new ByteArrayInputStream(html);\r
- ByteArrayOutputStream os = new ByteArrayOutputStream();\r
- tidy.parse(is, os);\r
- String text = StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", "")) +" "+\r
- n.getTitle();\r
+ // Strip the markup first and decode entities afterwards. Decoding first turns &lt; and &gt;\r
+ // in the note text into real angle brackets, and removeTags() would then delete the text\r
+ // between them as if it were a tag. The title is appended as-is (assumed to be plain text,\r
+ // not markup), so a '<' or '>' in it can no longer remove anything.\r
+ String text = StringEscapeUtils.unescapeHtml(removeTags(data)) +" "+\r
+ n.getTitle();\r
\r
logger.log(logger.EXTREME, "Splitting words");\r
String[] result = text.toString().split(regex);\r
\r
logger.log(logger.EXTREME, "Number of words found: " +result.length);\r
for (int j=0; j<result.length && keepRunning; j++) {\r
- logger.log(logger.EXTREME, "Result word: " +result[j]);\r
- addToIndex(guid, result[j], "CONTENT");\r
+ if (!result[j].trim().equals("")) {\r
+ logger.log(logger.EXTREME, "Result word: " +result[j].trim());\r
+ addToIndex(guid, result[j], "CONTENT");\r
+ }\r
}\r
// If we were interrupted, we will reindex this note next time\r
if (Global.keepRunning) {\r
}\r
logger.log(logger.EXTREME, "Leaving indexRunner.indexNoteContent()");\r
}\r
+ \r
+ \r
+ /**\r
+  * Remove markup tags from text, leaving a single space where each tag was.\r
+  *\r
+  * The space keeps words in adjacent elements apart: "<p>foo</p><p>bar</p>" yields the\r
+  * words "foo" and "bar" rather than "foobar". The text is scanned from the end, so a '>'\r
+  * opens a tag and the nearest '<' before it closes it. A '>' with no '<' before it removes\r
+  * everything up to the start of the text, and a lone '<' is replaced by a space. Pass text\r
+  * whose entities are still escaped; a decoded &gt; would be taken for the end of a tag.\r
+  *\r
+  * @param text markup to strip; null is treated as empty\r
+  * @return the text with every tag replaced by one space\r
+  */\r
+ private String removeTags(String text) {\r
+     if (text == null)\r
+         return "";\r
+     // Filled from the back. Every input character produces at most one output character,\r
+     // so the input length is always enough room.\r
+     char[] kept = new char[text.length()];\r
+     int pos = kept.length;\r
+     boolean inTag = false;\r
+     for (int i=text.length()-1; i>=0; i--) {\r
+         char c = text.charAt(i);\r
+         if (c == '>') {\r
+             inTag = true;\r
+         } else if (c == '<') {\r
+             // Start of the tag reached (scanning backwards): one space stands in for it.\r
+             inTag = false;\r
+             kept[--pos] = ' ';\r
+         } else if (!inTag) {\r
+             kept[--pos] = c;\r
+         }\r
+     }\r
+     // Unterminated tag running to the start of the text: still leave a separator.\r
+     // At least one '>' produced no output, so there is room for it.\r
+     if (inTag)\r
+         kept[--pos] = ' ';\r
+     return new String(kept, pos, kept.length - pos);\r
+ }\r
\r
\r
public synchronized boolean addWork(String request) {\r
\r
if (guid == null)\r
return;\r
- \r
+ foundWords.clear();\r
Resource r = conn.getNoteTable().noteResourceTable.getNoteResourceRecognition(guid);\r
if (r == null || r.getRecognition() == null || r.getRecognition().getBody() == null || r.getRecognition().getBody().length == 0) \r
resourceBinary = new QByteArray(" ");\r
}\r
}\r
\r
- if (Global.keepRunning) {\r
+ if (Global.keepRunning && indexAttachmentsLocally) {\r
indexResourceContent(guid);\r
}\r
- \r
+ \r
if (Global.keepRunning)\r
conn.getNoteTable().noteResourceTable.setIndexNeeded(guid,false);\r
}\r
\r
\r
private void indexResourceRTF(Resource r) {\r
+\r
QTemporaryFile f = writeResource(r.getData());\r
if (!keepRunning) {\r
return;\r
InputStream input;\r
try {\r
input = new FileInputStream(new File(f.fileName()));\r
- ContentHandler textHandler = new BodyContentHandler();\r
+ ContentHandler textHandler = new BodyContentHandler(-1);\r
Metadata metadata = new Metadata();\r
RTFParser parser = new RTFParser(); \r
ParseContext context = new ParseContext();\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
+ } catch (Exception e) {\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage());\r
}\r
}\r
\r
\r
private void indexResourceODF(Resource r) {\r
+\r
QTemporaryFile f = writeResource(r.getData());\r
if (!keepRunning) {\r
return;\r
InputStream input;\r
try {\r
input = new FileInputStream(new File(f.fileName()));\r
- ContentHandler textHandler = new BodyContentHandler();\r
+ ContentHandler textHandler = new BodyContentHandler(-1);\r
Metadata metadata = new Metadata();\r
OpenDocumentParser parser = new OpenDocumentParser(); \r
ParseContext context = new ParseContext();\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
+ } catch (Exception e) {\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage());\r
}\r
}\r
\r
\r
private void indexResourceOffice(Resource r) {\r
+\r
QTemporaryFile f = writeResource(r.getData());\r
if (!keepRunning) {\r
return;\r
InputStream input;\r
try {\r
input = new FileInputStream(new File(f.fileName()));\r
- ContentHandler textHandler = new BodyContentHandler();\r
+ ContentHandler textHandler = new BodyContentHandler(-1);\r
Metadata metadata = new Metadata();\r
OfficeParser parser = new OfficeParser(); \r
ParseContext context = new ParseContext();\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
+ } catch (Exception e) {\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage());\r
}\r
}\r
\r
\r
\r
+ /**\r
+  * Index the text of a PDF resource.\r
+  *\r
+  * The resource data is written to a temporary file with writeResource(); if keepRunning has\r
+  * been cleared the method returns at that point. Otherwise the file is opened and a Tika\r
+  * PDFParser is prepared with its content handler, metadata and parse context. Anything\r
+  * thrown inside the try block, including Errors, is logged at LOW level and not rethrown.\r
+  *\r
+  * NOTE(review): only part of the try body is visible in this excerpt.\r
+  *\r
+  * @param r resource whose data is to be indexed\r
+  */\r
private void indexResourcePDF(Resource r) {\r
+\r
QTemporaryFile f = writeResource(r.getData());\r
if (!keepRunning) {\r
return;\r
}\r
\r
InputStream input;\r
- try {\r
+ try { \r
input = new FileInputStream(new File(f.fileName()));\r
- ContentHandler textHandler = new BodyContentHandler();\r
+ // -1 presumably removes the handler's default write limit - confirm against the Tika docs.\r
+ ContentHandler textHandler = new BodyContentHandler(-1);\r
Metadata metadata = new Metadata();\r
PDFParser parser = new PDFParser(); \r
ParseContext context = new ParseContext();\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
+ // The remaining handlers catch every other Exception and Error and only log it.\r
+ } catch (Exception e) {\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage());\r
}\r
}\r
\r
\r
private void indexResourceOOXML(Resource r) {\r
+\r
QTemporaryFile f = writeResource(r.getData());\r
if (!keepRunning) {\r
return;\r
InputStream input;\r
try {\r
input = new FileInputStream(new File(f.fileName()));\r
- ContentHandler textHandler = new BodyContentHandler();\r
+ ContentHandler textHandler = new BodyContentHandler(-1);\r
Metadata metadata = new Metadata();\r
OOXMLParser parser = new OOXMLParser(); \r
ParseContext context = new ParseContext();\r
} catch (java.lang.ClassCastException e) {\r
logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
} catch (FileNotFoundException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "FileNotFound exception: " +e.getMessage());\r
} catch (IOException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "IO exception: " +e.getMessage());\r
} catch (SAXException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "SAX exception: " +e.getMessage());\r
} catch (TikaException e) {\r
- // TODO Auto-generated catch block\r
- e.printStackTrace();\r
+ logger.log(logger.LOW, "Tika exception: " +e.getMessage());\r
+ } catch (Exception e) {\r
+ logger.log(logger.LOW, "Unknown exception: " +e.getMessage());\r
+ } catch (java.lang.NoSuchMethodError e) {\r
+ logger.log(logger.LOW, "NoSuchMethod error: " +e.getMessage());\r
+ } catch (Error e) {\r
+ logger.log(logger.LOW, "Unknown error: " +e.getMessage());\r
}\r
}\r
\r
newFile.write(d.getBody());\r
newFile.close();\r
return newFile;\r
- }\r
+ } \r
\r
\r
private String removeEnCrypt(String content) {\r
\r
\r
+ /**\r
+  * Add one word of the current note or resource to the search index.\r
+  *\r
+  * The word is lower-cased and stripped of leading and trailing characters that are not\r
+  * letters or digits. foundWords remembers both the raw and the cleaned form, so a word\r
+  * already seen since foundWords was last cleared is not written to the words table again.\r
+  *\r
+  * @param guid guid passed to the words table together with the word\r
+  * @param word raw token as split from the text\r
+  * @param type source tag passed through to the words table (indexNoteContent() uses "CONTENT")\r
+  */\r
private void addToIndex(String guid, String word, String type) {\r
- if (word.length() > 0) {\r
+     if (foundWords.contains(word))\r
+         return;\r
+     // Trim by moving two indices inwards instead of deleting characters and reversing twice.\r
+     String lowered = word.toLowerCase();\r
+     int start = 0;\r
+     int end = lowered.length();\r
+     while (end > start && !Character.isLetterOrDigit(lowered.charAt(end-1)))\r
+         end--;\r
+     while (start < end && !Character.isLetterOrDigit(lowered.charAt(start)))\r
+         start++;\r
+     String cleaned = lowered.substring(start, end);\r
+     if (cleaned.length() > 0) {\r
// We have a good word, now let's trim off junk at the beginning or end\r
- StringBuffer buffer = new StringBuffer(word.toLowerCase());\r
- for (int x = buffer.length()-1; x>=0; x--) {\r
- if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
- buffer = buffer.deleteCharAt(x);\r
- else\r
- x=-1;\r
+         if (!foundWords.contains(cleaned)) {\r
+             foundWords.add(cleaned);\r
+             foundWords.add(word);\r
+             conn.getWordsTable().addWordToNoteIndex(guid, cleaned, type, 100);\r
}\r
- // Things have been trimmed off the end, so reverse the string & repeat.\r
- buffer = buffer.reverse();\r
- for (int x = buffer.length()-1; x>=0; x--) {\r
- if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
- buffer = buffer.deleteCharAt(x);\r
- else\r
- x=-1;\r
+     }\r
+ }\r
+ \r
+ /**\r
+  * Index every note, then every resource, that the database reports as unindexed.\r
+  *\r
+  * indexStarted is emitted once, as soon as either list turns out to be non-empty.\r
+  * Both loops stop as soon as interrupt is set or keepRunning is cleared, and\r
+  * indexFinished is emitted only when work was started and neither of those happened.\r
+  */\r
+ private void scanUnindexed() {\r
+     List<String> pendingNotes = conn.getNoteTable().getUnindexed();\r
+     guid = null;\r
+     boolean started = false;\r
+     if (!pendingNotes.isEmpty()) {\r
+         signal.indexStarted.emit();\r
+         started = true;\r
+     }\r
+     for (String noteGuid : pendingNotes) {\r
+         if (interrupt || !keepRunning)\r
+             break;\r
+         // indexNoteContent() reads the guid field rather than taking a parameter.\r
+         guid = noteGuid;\r
+         if (guid != null && keepRunning) {\r
+             //waitSeconds(1);\r
+             indexNoteContent();\r
}\r
- // Restore the string back to the proper order.\r
- buffer = buffer.reverse();\r
+     }\r
\r
- logger.log(logger.EXTREME, "Processing " +buffer);\r
- if (buffer.length()>=Global.minimumWordCount) {\r
- logger.log(logger.EXTREME, "Adding " +buffer);\r
- conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), type, 100);\r
+     List<String> pendingResources = conn.getNoteTable().noteResourceTable.getUnindexed();\r
+     if (!started && !pendingResources.isEmpty()) {\r
+         signal.indexStarted.emit();\r
+         started = true;\r
+     }\r
+     for (String resourceGuid : pendingResources) {\r
+         if (interrupt || !keepRunning)\r
+             break;\r
+         guid = resourceGuid;\r
+         if (keepRunning) {\r
+             //waitSeconds(1);\r
+             indexResource();\r
}\r
}\r
+     if (started && !interrupt && keepRunning)\r
+         signal.indexFinished.emit();\r
}\r
\r
+ /**\r
+  * Flag the note identified by the guid field as needing to be indexed.\r
+  * Does nothing when no guid is set.\r
+  */\r
+ private void reindexNote() {\r
+     if (guid != null) {\r
+         conn.getNoteTable().setIndexNeeded(guid, true);\r
+     }\r
+ }\r
+ \r
+ /**\r
+  * Ask the note table, and then the note-resource table, to reindex everything.\r
+  * run() logs this step as "Marking all for reindex".\r
+  */\r
+ private void reindexAll() {\r
+ conn.getNoteTable().reindexAllNotes();\r
+ conn.getNoteTable().noteResourceTable.reindexAll(); \r
+ }\r
\r
+// private void waitSeconds(int len) {\r
+// QDateTime currentdate = new QDateTime(QDateTime.currentDateTime());\r
+// QDateTime futuredate = new QDateTime(QDateTime.currentDateTime());\r
+// \r
+// while (keepRunning && (futuredate.toTime_t() - currentdate.toTime_t() >=len) ) {\r
+// Thread.yield();\r
+// futuredate = new QDateTime(QDateTime.currentDateTime());\r
+// }\r
+// }\r
}\r