\r
package cx.fbn.nevernote.threads;\r
\r
-import java.io.ByteArrayInputStream;\r
-import java.io.ByteArrayOutputStream;\r
import java.io.File;\r
import java.io.FileInputStream;\r
import java.io.FileNotFoundException;\r
import org.apache.tika.parser.pdf.PDFParser;\r
import org.apache.tika.parser.rtf.RTFParser;\r
import org.apache.tika.sax.BodyContentHandler;\r
-import org.w3c.tidy.Tidy;\r
import org.xml.sax.ContentHandler;\r
import org.xml.sax.SAXException;\r
\r
// NOTE(review): named like a constant but not declared final; its use is not
// visible in this excerpt.
private static int MAX_QUEUED_WAITING = 1000;\r
// Read by the unindexed-resource loop condition (`!interrupt`): that loop
// stops iterating once this is true.
public boolean interrupt;\r
public boolean idle;\r
// Added flag, true by default: indexResourceContent(guid) is only called
// while this is true (and Global.keepRunning is set).
+ public boolean indexAttachmentsLocally = true;\r
// Used to emit indexStarted when unindexed resources are found.
public volatile IndexSignal signal;\r
\r
\r
e.printStackTrace();\r
}\r
}\r
+ logger.log(logger.EXTREME, "Shutting down database");\r
conn.dbShutdown();\r
+ logger.log(logger.EXTREME, "Database shut down. Exiting thread");\r
}\r
\r
// Reindex a note.
//
// Loads the note identified by the `guid` field, strips encrypted sections
// and markup from its content, appends the title, splits the text on `regex`,
// and passes every non-blank token to addToIndex() with type "CONTENT".
//
// NOTE(review): the note returned by getNote() is dereferenced without a null
// check -- confirm getNote() cannot return null for a queued guid.
public void indexNoteContent() {\r
\r
-// if (wordMap.size() > 0)\r
-// wordMap.clear();\r
logger.log(logger.EXTREME, "Entering indexRunner.indexNoteContent()");\r
\r
logger.log(logger.EXTREME, "Getting note content");\r
Note n = conn.getNoteTable().getNote(guid,true,false,true,true, true);\r
// NOTE(review): with the added line below, this initial value is overwritten
// before it is ever read.
String data = n.getContent();\r
+ data = conn.getNoteTable().getNoteContentNoUTFConversion(n.getGuid());\r
\r
logger.log(logger.EXTREME, "Removing any encrypted data");\r
- data = removeEnCrypt(data);\r
// `data` is already a String, so the toString() call on the added line is a no-op.
+ data = removeEnCrypt(data.toString());\r
logger.log(logger.EXTREME, "Removing xml markups");\r
- Tidy tidy = new Tidy();\r
- tidy.getStderr().close(); // the listener will capture messages\r
- tidy.setXmlTags(true);\r
- byte html[] = data.getBytes();\r
- ByteArrayInputStream is = new ByteArrayInputStream(html);\r
- ByteArrayOutputStream os = new ByteArrayOutputStream();\r
- tidy.parse(is, os);\r
- String text = StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", "")) +" "+\r
- n.getTitle();\r
// In the added version the content is unescaped and the title appended
// first, and the combined string is then passed through removeTags().
// NOTE(review): unescaping first presumably turns "&lt;"/"&gt;" in note text
// into literal angle brackets that removeTags() then treats as markup --
// consider stripping tags before unescaping; confirm against real notes.
+ String text = removeTags(StringEscapeUtils.unescapeHtml(data) +" "+\r
+ n.getTitle());\r
\r
logger.log(logger.EXTREME, "Splitting words");\r
// `text` is a String, so toString() here is a no-op as well.
String[] result = text.toString().split(regex);\r
\r
logger.log(logger.EXTREME, "Number of words found: " +result.length);\r
// The loop ends early once keepRunning is cleared.
for (int j=0; j<result.length && keepRunning; j++) {\r
- logger.log(logger.EXTREME, "Result word: " +result[j]);\r
- addToIndex(guid, result[j], "CONTENT");\r
// Added guard: tokens that are empty or whitespace-only are skipped.
+ if (!result[j].trim().equals("")) {\r
+ logger.log(logger.EXTREME, "Result word: " +result[j]);\r
+ addToIndex(guid, result[j], "CONTENT");\r
+ }\r
}\r
// If we were interrupted, we will reindex this note next time\r
// NOTE(review): the block below is empty, and it tests Global.keepRunning
// while the loop above tests keepRunning -- confirm whether a call that marks
// the note as indexed is meant to go here.
if (Global.keepRunning) {\r
}\r
logger.log(logger.EXTREME, "Leaving indexRunner.indexNoteContent()");\r
}\r
+ \r
+ \r
// Removes markup from `text` and returns what is left as a new String.
//
// The scan runs from the last character to the first: a '>' switches it into
// "inside a tag" mode, the next '<' reached switches that mode off, and every
// character visited while in that mode (plus the '<' itself) is deleted.
// Walking backwards means a deletion never moves a character that has not
// been visited yet.
//
// NOTE(review): a '>' with no '<' to its left deletes everything from that
// '>' back to the start of the text; a '<' with no '>' to its right is simply
// dropped on its own.
// NOTE(review): each deleteCharAt() shifts the tail of the buffer; appending
// the kept characters to a fresh buffer in a single pass would avoid the
// repeated shifting.
+ private String removeTags(String text) {\r
+ StringBuffer buffer = new StringBuffer(text);\r
// true while the scan is between a '>' and the '<' that precedes it
+ boolean inTag = false;\r
+ for (int i=buffer.length()-1; i>=0; i--) {\r
+ if (buffer.charAt(i) == '>')\r
+ inTag = true;\r
+ if (buffer.charAt(i) == '<')\r
+ inTag = false;\r
// inTag was just cleared for '<', so it is matched explicitly here to make
// sure the opening bracket is removed along with the tag body.
+ if (inTag || buffer.charAt(i) == '<')\r
+ buffer.deleteCharAt(i);\r
+ }\r
+ \r
+ return buffer.toString();\r
+ }\r
\r
\r
public synchronized boolean addWork(String request) {\r
}\r
}\r
\r
- if (Global.keepRunning) {\r
+ if (Global.keepRunning && indexAttachmentsLocally) {\r
indexResourceContent(guid);\r
}\r
- \r
+ \r
if (Global.keepRunning)\r
conn.getNoteTable().noteResourceTable.setIndexNeeded(guid,false);\r
}\r
if (word.length() > 0) {\r
// We have a good word, now let's trim off junk at the beginning or end\r
StringBuffer buffer = new StringBuffer(word.toLowerCase());\r
- for (int x = buffer.length()-1; x>=0; x--) {\r
- if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
- buffer = buffer.deleteCharAt(x);\r
- else\r
- x=-1;\r
- }\r
- // Things have been trimmed off the end, so reverse the string & repeat.\r
- buffer = buffer.reverse();\r
- for (int x = buffer.length()-1; x>=0 && keepRunning; x--) {\r
- if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
- buffer = buffer.deleteCharAt(x);\r
- else\r
- x=-1;\r
- }\r
- // Restore the string back to the proper order.\r
- buffer = buffer.reverse();\r
- \r
- if (buffer.length()>=Global.minimumWordCount) {\r
- conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), type, 100);\r
- }\r
+ conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), type, 100);\r
}\r
return;\r
}\r
}\r
\r
List<String> unindexedResources = conn.getNoteTable().noteResourceTable.getUnindexed();\r
- if (notes.size() > 0 && !started) {\r
+ if (unindexedResources.size() > 0 && !started) {\r
signal.indexStarted.emit();\r
started = true;\r
}\r
- for (int i=0; i>unindexedResources.size()&& !interrupt && keepRunning; i++) {\r
+ for (int i=0; i<unindexedResources.size()&& !interrupt && keepRunning; i++) {\r
guid = unindexedResources.get(i);\r
if (keepRunning) {\r
indexResource();\r