\r
package cx.fbn.nevernote.threads;\r
\r
+import java.io.ByteArrayInputStream;\r
+import java.io.ByteArrayOutputStream;\r
import java.util.concurrent.LinkedBlockingQueue;\r
\r
+import org.apache.commons.lang.StringEscapeUtils;\r
+import org.w3c.tidy.Tidy;\r
+\r
import com.evernote.edam.type.Note;\r
import com.evernote.edam.type.Resource;\r
import com.trolltech.qt.core.QByteArray;\r
import cx.fbn.nevernote.signals.NoteSignal;\r
import cx.fbn.nevernote.sql.DatabaseConnection;\r
import cx.fbn.nevernote.utilities.ApplicationLogger;\r
-import cx.fbn.nevernote.utilities.StringUtils;\r
\r
-//public class IndexRunner implements QRunnable {\r
public class IndexRunner extends QObject implements Runnable {\r
\r
// Per-runner state. NOTE(review): this span still carries diff markers ('-' / '+' line prefixes).
private final ApplicationLogger logger; // created in the constructor from the supplied log name
private int indexType; // set to CONTENT in the constructor; changed through setIndexType
public final int CONTENT=1; // index-type value the constructor selects by default
public final int RESOURCE=2; // second index-type value
- private boolean keepRunning;
-// public volatile int ID;
+ public boolean keepRunning; // true after construction; the word-indexing loop stops once it is false. NOTE(review): not volatile - confirm which threads write it
private final QDomDocument doc; // instantiated once in the constructor
- private final int threadID;
private static String regex = Global.getWordRegex(); // pattern used to split note text into words
private final DatabaseConnection conn; // opened in the constructor
private volatile LinkedBlockingQueue<String> workQueue; // bounded queue; capacity is MAX_QUEUED_WAITING
-// private static int MAX_EMPTY_QUEUE_COUNT = 1;
private static int MAX_QUEUED_WAITING = 1000; // capacity handed to workQueue in the constructor
+\r
\r
\r
\r
/**
 * Creates an index runner with its own logger, database connection, signal objects,
 * DOM document and bounded work queue. The runner starts with the CONTENT index type,
 * no current guid, and keepRunning set to true.
 *
 * NOTE(review): this span carries diff markers; the '-' lines appear to be the previous
 * revision (connection keyed by Global.indexThreadId) and the '+' lines its replacement.
 *
 * @param logname name handed to ApplicationLogger
 * @param u       forwarded unchanged to DatabaseConnection - presumably the database URL; TODO confirm
 * @param uid     forwarded unchanged to DatabaseConnection - presumably the user id; TODO confirm
 * @param pswd    forwarded unchanged to DatabaseConnection - presumably the password; TODO confirm
 * @param cpswd   forwarded unchanged to DatabaseConnection - meaning not visible here; TODO confirm
 */
- public IndexRunner(String logname) {
+ public IndexRunner(String logname, String u, String uid, String pswd, String cpswd) {
logger = new ApplicationLogger(logname);
- threadID = Global.indexThreadId;
- conn = new DatabaseConnection(logger, threadID);
+ conn = new DatabaseConnection(logger, u, uid, pswd, cpswd);
noteSignal = new NoteSignal();
resourceSignal = new NoteResourceSignal();
-// threadSignal = new ThreadSignal();
indexType = CONTENT; // default index type until setIndexType is called
guid = null; // no note selected yet
keepRunning = true;
doc = new QDomDocument();
- workQueue=new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING);
+ workQueue=new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING); // capacity-limited queue of String work items
}
\r
- \r
public void setIndexType(int t) {\r
indexType = t;\r
}\r
e.printStackTrace();\r
}\r
}\r
+ conn.dbShutdown();\r
}\r
\r
// Reindex a note\r
logger.log(logger.EXTREME, "Removing any encrypted data");\r
data = removeEnCrypt(data);\r
logger.log(logger.EXTREME, "Removing xml markups");\r
- String text = StringUtils.unescapeHTML(data.replaceAll("\\<.*?\\>", ""),0);\r
- \r
+ // These HTML characters need to be replaced by a space, or they'll cause words to jam together\r
+// data = data.toLowerCase().replace("<br>", " ").replace("<hr>", " ").replace("<p>", " ").replace("<href>", " ");\r
+// String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", ""));\r
+ Tidy tidy = new Tidy();\r
+ tidy.getStderr().close(); // the listener will capture messages\r
+ tidy.setXmlTags(true);\r
+ byte html[] = data.getBytes();\r
+ ByteArrayInputStream is = new ByteArrayInputStream(html);\r
+ ByteArrayOutputStream os = new ByteArrayOutputStream();\r
+ tidy.parse(is, os);\r
+ String text = StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", "")) +" "+\r
+ n.getTitle();\r
+ \r
logger.log(logger.EXTREME, "Splitting words");\r
String[] result = text.toString().split(regex);\r
logger.log(logger.EXTREME, "Deleting existing words for note from index");\r
for (int j=0; j<result.length && keepRunning; j++) {\r
logger.log(logger.EXTREME, "Result word: " +result[j]);\r
if (result[j].length() > 0) {\r
- if (Character.isLetterOrDigit(result[j].charAt(0))) {\r
- int len = result[j].length();\r
- StringBuffer buffer = new StringBuffer(result[j].toLowerCase());\r
- logger.log(logger.EXTREME, "Processing " +buffer);\r
- for (int k=len-1; k>=0 && keepRunning; k--) {\r
- if (!Character.isLetterOrDigit(result[j].charAt(k)))\r
- buffer.deleteCharAt(k);\r
- else\r
- k=-1;\r
- }\r
-\r
- if (buffer.length()>=Global.minimumWordCount) {\r
- logger.log(logger.EXTREME, "Adding " +buffer);\r
- conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), "CONTENT", 100);\r
- }\r
+ // We have a good word, now let's trim off junk at the beginning or end\r
+ StringBuffer buffer = new StringBuffer(result[j].toLowerCase());\r
+ for (int x = buffer.length()-1; x>=0; x--) {\r
+ if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
+ buffer = buffer.deleteCharAt(x);\r
+ else\r
+ x=-1;\r
+ }\r
+ // Things have been trimmed off the end, so reverse the string & repeat.\r
+ buffer = buffer.reverse();\r
+ for (int x = buffer.length()-1; x>=0; x--) {\r
+ if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
+ buffer = buffer.deleteCharAt(x);\r
+ else\r
+ x=-1;\r
+ }\r
+ // Restore the string back to the proper order.\r
+ buffer = buffer.reverse();\r
+ \r
+ logger.log(logger.EXTREME, "Processing " +buffer);\r
+ if (buffer.length()>=Global.minimumWordCount) {\r
+ logger.log(logger.EXTREME, "Adding " +buffer);\r
+ conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), "CONTENT", 100);\r
}\r
}\r
}\r