OSDN Git Service

Correct problem parsing notes where carriage returns did not separate some XML lines...
author Randy Baumgarte <randy@fbn.cx>
Mon, 27 Sep 2010 23:09:09 +0000 (19:09 -0400)
committer Randy Baumgarte <randy@fbn.cx>
Fri, 1 Oct 2010 16:43:15 +0000 (12:43 -0400)
src/cx/fbn/nevernote/threads/IndexRunner.java

index 5abecfc..1c02862 100644 (file)
 \r
 package cx.fbn.nevernote.threads;\r
 \r
+import java.io.ByteArrayInputStream;\r
+import java.io.ByteArrayOutputStream;\r
 import java.util.concurrent.LinkedBlockingQueue;\r
 \r
 import org.apache.commons.lang.StringEscapeUtils;\r
+import org.w3c.tidy.Tidy;\r
 \r
 import com.evernote.edam.type.Note;\r
 import com.evernote.edam.type.Resource;\r
@@ -37,7 +40,6 @@ import cx.fbn.nevernote.signals.NoteSignal;
 import cx.fbn.nevernote.sql.DatabaseConnection;\r
 import cx.fbn.nevernote.utilities.ApplicationLogger;\r
 \r
-//public class IndexRunner implements QRunnable {\r
 public class IndexRunner extends QObject implements Runnable {\r
        \r
        private final ApplicationLogger         logger;\r
@@ -49,12 +51,10 @@ public class IndexRunner extends QObject implements Runnable {
        public final int                                        CONTENT=1; \r
        public final int                                        RESOURCE=2;\r
        private boolean                                         keepRunning;\r
-//     public volatile int                                     ID;\r
        private final QDomDocument                      doc;\r
        private static String                           regex = Global.getWordRegex();\r
        private final DatabaseConnection        conn;\r
        private volatile LinkedBlockingQueue<String> workQueue;\r
-//     private static int MAX_EMPTY_QUEUE_COUNT = 1;\r
        private static int MAX_QUEUED_WAITING = 1000;\r
 \r
        \r
@@ -65,15 +65,13 @@ public class IndexRunner extends QObject implements Runnable {
                conn = new DatabaseConnection(logger, u, uid, pswd, cpswd);\r
                noteSignal = new NoteSignal();\r
                resourceSignal = new NoteResourceSignal();\r
-//             threadSignal = new ThreadSignal();\r
                indexType = CONTENT;\r
                guid = null;\r
                keepRunning = true;\r
                doc = new QDomDocument();\r
-               workQueue=new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING);\r
+               workQueue=new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING);  \r
        }\r
        \r
-       \r
        public void setIndexType(int t) {\r
                indexType = t;\r
        }\r
@@ -134,9 +132,18 @@ public class IndexRunner extends QObject implements Runnable {
                logger.log(logger.EXTREME, "Removing any encrypted data");\r
                data = removeEnCrypt(data);\r
                logger.log(logger.EXTREME, "Removing xml markups");\r
-               String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", ""));\r
-\r
-               \r
+               // These HTML characters need to be replaced by a space, or they'll cause words to jam together\r
+//             data = data.toLowerCase().replace("<br>", " ").replace("<hr>", " ").replace("<p>", " ").replace("<href>", " ");\r
+//             String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", ""));\r
+               Tidy tidy = new Tidy();\r
+               tidy.getStderr().close();  // the listener will capture messages\r
+               tidy.setXmlTags(true);\r
+               byte html[] = data.getBytes();\r
+               ByteArrayInputStream is = new ByteArrayInputStream(html);\r
+               ByteArrayOutputStream os = new ByteArrayOutputStream();\r
+               tidy.parse(is, os);\r
+               String text =  StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", ""));\r
+                               \r
                logger.log(logger.EXTREME, "Splitting words");\r
                String[] result = text.toString().split(regex);\r
                logger.log(logger.EXTREME, "Deleting existing words for note from index");\r
@@ -146,21 +153,29 @@ public class IndexRunner extends QObject implements Runnable {
                for (int j=0; j<result.length && keepRunning; j++) {\r
                        logger.log(logger.EXTREME, "Result word: " +result[j]);\r
                        if (result[j].length() > 0) {\r
-                               if (Character.isLetterOrDigit(result[j].charAt(0))) {\r
-                                       int len = result[j].length();\r
-                                       StringBuffer buffer = new StringBuffer(result[j].toLowerCase());\r
-                                       logger.log(logger.EXTREME, "Processing " +buffer);\r
-                                       for (int k=len-1; k>=0 && keepRunning; k--) {\r
-                                               if (!Character.isLetterOrDigit(result[j].charAt(k)))\r
-                                                       buffer.deleteCharAt(k);\r
-                                               else\r
-                                                       k=-1;\r
-                                       }\r
-\r
-                                       if (buffer.length()>=Global.minimumWordCount) {\r
-                                               logger.log(logger.EXTREME, "Adding " +buffer);\r
-                                               conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), "CONTENT", 100);\r
-                                       }\r
+                               // We have a good word, now let's trim off junk at the beginning or end\r
+                               StringBuffer buffer = new StringBuffer(result[j].toLowerCase());\r
+                               for (int x = buffer.length()-1; x>=0; x--) {\r
+                                       if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
+                                               buffer = buffer.deleteCharAt(x);\r
+                                       else\r
+                                               x=-1;\r
+                               }\r
+                               // Things have been trimmed off the end, so reverse the string & repeat.\r
+                               buffer = buffer.reverse();\r
+                               for (int x = buffer.length()-1; x>=0; x--) {\r
+                                       if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
+                                               buffer = buffer.deleteCharAt(x);\r
+                                       else\r
+                                               x=-1;\r
+                               }\r
+                               // Restore the string back to the proper order.\r
+                               buffer = buffer.reverse();\r
+                       \r
+                               logger.log(logger.EXTREME, "Processing " +buffer);\r
+                               if (buffer.length()>=Global.minimumWordCount) {\r
+                                       logger.log(logger.EXTREME, "Adding " +buffer);\r
+                                       conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), "CONTENT", 100);\r
                                }\r
                        }\r
                }\r