OSDN Git Service

Added logic to download all linked & shared data structures.
[neighbornote/NeighborNote.git] / src / cx / fbn / nevernote / threads / IndexRunner.java
index 129126e..62820c2 100644 (file)
 \r
 package cx.fbn.nevernote.threads;\r
 \r
+import java.io.ByteArrayInputStream;\r
+import java.io.ByteArrayOutputStream;\r
 import java.util.concurrent.LinkedBlockingQueue;\r
 \r
+import org.apache.commons.lang.StringEscapeUtils;\r
+import org.w3c.tidy.Tidy;\r
+\r
 import com.evernote.edam.type.Note;\r
 import com.evernote.edam.type.Resource;\r
 import com.trolltech.qt.core.QByteArray;\r
@@ -34,9 +39,7 @@ import cx.fbn.nevernote.signals.NoteResourceSignal;
 import cx.fbn.nevernote.signals.NoteSignal;\r
 import cx.fbn.nevernote.sql.DatabaseConnection;\r
 import cx.fbn.nevernote.utilities.ApplicationLogger;\r
-import cx.fbn.nevernote.utilities.StringUtils;\r
 \r
-//public class IndexRunner implements QRunnable {\r
 public class IndexRunner extends QObject implements Runnable {\r
        \r
        private final ApplicationLogger         logger;\r
@@ -47,33 +50,28 @@ public class IndexRunner extends QObject implements Runnable {
        private int                                                     indexType;\r
        public final int                                        CONTENT=1; \r
        public final int                                        RESOURCE=2;\r
-       private boolean                                         keepRunning;\r
-//     public volatile int                                     ID;\r
+       public boolean                                          keepRunning;\r
        private final QDomDocument                      doc;\r
-       private final int                                                       threadID;\r
        private static String                           regex = Global.getWordRegex();\r
        private final DatabaseConnection        conn;\r
        private volatile LinkedBlockingQueue<String> workQueue;\r
-//     private static int MAX_EMPTY_QUEUE_COUNT = 1;\r
        private static int MAX_QUEUED_WAITING = 1000;\r
+\r
        \r
 \r
        \r
-       public IndexRunner(String logname) {\r
+       public IndexRunner(String logname, String u, String uid, String pswd, String cpswd) {\r
                logger = new ApplicationLogger(logname);\r
-               threadID = Global.indexThreadId;\r
-               conn = new DatabaseConnection(logger, threadID);\r
+               conn = new DatabaseConnection(logger, u, uid, pswd, cpswd);\r
                noteSignal = new NoteSignal();\r
                resourceSignal = new NoteResourceSignal();\r
-//             threadSignal = new ThreadSignal();\r
                indexType = CONTENT;\r
                guid = null;\r
                keepRunning = true;\r
                doc = new QDomDocument();\r
-               workQueue=new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING);\r
+               workQueue=new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING);  \r
        }\r
        \r
-       \r
        public void setIndexType(int t) {\r
                indexType = t;\r
        }\r
@@ -120,6 +118,7 @@ public class IndexRunner extends QObject implements Runnable {
                                e.printStackTrace();\r
                        }\r
                }\r
+               conn.dbShutdown();\r
        }\r
        \r
        // Reindex a note\r
@@ -133,8 +132,19 @@ public class IndexRunner extends QObject implements Runnable {
                logger.log(logger.EXTREME, "Removing any encrypted data");\r
                data = removeEnCrypt(data);\r
                logger.log(logger.EXTREME, "Removing xml markups");\r
-               String text = StringUtils.unescapeHTML(data.replaceAll("\\<.*?\\>", ""),0);\r
-               \r
+               // These HTML characters need to be replaced by a space, or they'll cause words to jam together\r
+//             data = data.toLowerCase().replace("<br>", " ").replace("<hr>", " ").replace("<p>", " ").replace("<href>", " ");\r
+//             String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", ""));\r
+               Tidy tidy = new Tidy();\r
+               tidy.getStderr().close();  // the listener will capture messages\r
+               tidy.setXmlTags(true);\r
+               byte html[] = data.getBytes();\r
+               ByteArrayInputStream is = new ByteArrayInputStream(html);\r
+               ByteArrayOutputStream os = new ByteArrayOutputStream();\r
+               tidy.parse(is, os);\r
+               String text =  StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", "")) +" "+\r
+               n.getTitle();\r
+                               \r
                logger.log(logger.EXTREME, "Splitting words");\r
                String[] result = text.toString().split(regex);\r
                logger.log(logger.EXTREME, "Deleting existing words for note from index");\r
@@ -144,21 +154,29 @@ public class IndexRunner extends QObject implements Runnable {
                for (int j=0; j<result.length && keepRunning; j++) {\r
                        logger.log(logger.EXTREME, "Result word: " +result[j]);\r
                        if (result[j].length() > 0) {\r
-                               if (Character.isLetterOrDigit(result[j].charAt(0))) {\r
-                                       int len = result[j].length();\r
-                                       StringBuffer buffer = new StringBuffer(result[j].toLowerCase());\r
-                                       logger.log(logger.EXTREME, "Processing " +buffer);\r
-                                       for (int k=len-1; k>=0 && keepRunning; k--) {\r
-                                               if (!Character.isLetterOrDigit(result[j].charAt(k)))\r
-                                                       buffer.deleteCharAt(k);\r
-                                               else\r
-                                                       k=-1;\r
-                                       }\r
-\r
-                                       if (buffer.length()>=Global.minimumWordCount) {\r
-                                               logger.log(logger.EXTREME, "Adding " +buffer);\r
-                                               conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), "CONTENT", 100);\r
-                                       }\r
+                               // We have a good word, now let's trim off junk at the beginning or end\r
+                               StringBuffer buffer = new StringBuffer(result[j].toLowerCase());\r
+                               for (int x = buffer.length()-1; x>=0; x--) {\r
+                                       if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
+                                               buffer = buffer.deleteCharAt(x);\r
+                                       else\r
+                                               x=-1;\r
+                               }\r
+                               // Things have been trimmed off the end, so reverse the string & repeat.\r
+                               buffer = buffer.reverse();\r
+                               for (int x = buffer.length()-1; x>=0; x--) {\r
+                                       if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
+                                               buffer = buffer.deleteCharAt(x);\r
+                                       else\r
+                                               x=-1;\r
+                               }\r
+                               // Restore the string back to the proper order.\r
+                               buffer = buffer.reverse();\r
+                       \r
+                               logger.log(logger.EXTREME, "Processing " +buffer);\r
+                               if (buffer.length()>=Global.minimumWordCount) {\r
+                                       logger.log(logger.EXTREME, "Adding " +buffer);\r
+                                       conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), "CONTENT", 100);\r
                                }\r
                        }\r
                }\r