OSDN Git Service

Added more thumbnail logic & correct search bugs.
[neighbornote/NeighborNote.git] / src / cx / fbn / nevernote / threads / IndexRunner.java
1 /*\r
2  * This file is part of NeverNote \r
3  * Copyright 2009 Randy Baumgarte\r
4  * \r
5  * This file may be licensed under the terms of the\r
6  * GNU General Public License Version 2 (the ``GPL'').\r
7  *\r
8  * Software distributed under the License is distributed\r
9  * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either\r
10  * express or implied. See the GPL for the specific language\r
11  * governing rights and limitations.\r
12  *\r
13  * You should have received a copy of the GPL along with this\r
14  * program. If not, go to http://www.gnu.org/licenses/gpl.html\r
15  * or write to the Free Software Foundation, Inc.,\r
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\r
17  *\r
18 */\r
19 \r
20 package cx.fbn.nevernote.threads;\r
21 \r
22 import java.io.ByteArrayInputStream;\r
23 import java.io.ByteArrayOutputStream;\r
24 import java.util.concurrent.LinkedBlockingQueue;\r
25 \r
26 import org.apache.commons.lang.StringEscapeUtils;\r
27 import org.w3c.tidy.Tidy;\r
28 \r
29 import com.evernote.edam.type.Note;\r
30 import com.evernote.edam.type.Resource;\r
31 import com.trolltech.qt.core.QByteArray;\r
32 import com.trolltech.qt.core.QObject;\r
33 import com.trolltech.qt.xml.QDomDocument;\r
34 import com.trolltech.qt.xml.QDomElement;\r
35 import com.trolltech.qt.xml.QDomNodeList;\r
36 \r
37 import cx.fbn.nevernote.Global;\r
38 import cx.fbn.nevernote.signals.NoteResourceSignal;\r
39 import cx.fbn.nevernote.signals.NoteSignal;\r
40 import cx.fbn.nevernote.sql.DatabaseConnection;\r
41 import cx.fbn.nevernote.utilities.ApplicationLogger;\r
42 \r
43 public class IndexRunner extends QObject implements Runnable {\r
44         \r
45         private final ApplicationLogger         logger;\r
46         private String                                          guid;\r
47         private QByteArray                                      resourceBinary;\r
48         public volatile NoteSignal                      noteSignal;\r
49         public volatile NoteResourceSignal      resourceSignal;\r
50         private int                                                     indexType;\r
51         public final int                                        CONTENT=1; \r
52         public final int                                        RESOURCE=2;\r
53         public boolean                                          keepRunning;\r
54         private final QDomDocument                      doc;\r
55         private static String                           regex = Global.getWordRegex();\r
56         private final DatabaseConnection        conn;\r
57         private volatile LinkedBlockingQueue<String> workQueue;\r
58         private static int MAX_QUEUED_WAITING = 1000;\r
59 \r
60         \r
61 \r
62         \r
63         public IndexRunner(String logname, String u, String uid, String pswd, String cpswd) {\r
64                 logger = new ApplicationLogger(logname);\r
65                 conn = new DatabaseConnection(logger, u, uid, pswd, cpswd);\r
66                 noteSignal = new NoteSignal();\r
67                 resourceSignal = new NoteResourceSignal();\r
68                 indexType = CONTENT;\r
69                 guid = null;\r
70                 keepRunning = true;\r
71                 doc = new QDomDocument();\r
72                 workQueue=new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING);  \r
73         }\r
74         \r
75         public void setIndexType(int t) {\r
76                 indexType = t;\r
77         }\r
78         \r
79         \r
80         @Override\r
81         public void run() {\r
82                 thread().setPriority(Thread.MIN_PRIORITY);\r
83                 logger.log(logger.EXTREME, "Starting index thread ");\r
84                 while (keepRunning) {\r
85                         try {\r
86                                 String work = workQueue.take();\r
87                                 if (work.startsWith("CONTENT")) {\r
88                                         work = work.replace("CONTENT ", "");\r
89                                         guid = work;\r
90                                         indexType = CONTENT;\r
91                                 }\r
92                                 if (work.startsWith("RESOURCE")) {\r
93                                         work = work.replace("RESOURCE ", "");\r
94                                         guid = work;\r
95                                         indexType = RESOURCE;\r
96                                 }\r
97                                 if (work.startsWith("STOP")) {\r
98                                         keepRunning = false;\r
99                                         guid = work;\r
100                                 }\r
101                                 if (guid == null || guid.trim().equals("")) {\r
102                                         setIndexType(0);\r
103                                         resourceSignal.resourceIndexed.emit("null or empty guid");\r
104                                 }\r
105                                 logger.log(logger.EXTREME, "Type:" +indexType);\r
106                                 if (indexType == CONTENT && keepRunning) {\r
107                                         logger.log(logger.MEDIUM, "Indexing note: "+guid);\r
108                                         indexNoteContent();\r
109                                         setIndexType(0);\r
110                                 }\r
111                                 if (indexType == RESOURCE && keepRunning) {\r
112                                         logger.log(logger.MEDIUM, "Indexing resource: "+guid);\r
113                                         indexResource();\r
114                                         setIndexType(0);\r
115                                 }\r
116                         } catch (InterruptedException e) {\r
117                                 // TODO Auto-generated catch block\r
118                                 e.printStackTrace();\r
119                         }\r
120                 }\r
121                 conn.dbShutdown();\r
122         }\r
123         \r
124         // Reindex a note\r
125         public void indexNoteContent() {\r
126                 logger.log(logger.EXTREME, "Entering indexRunner.indexNoteContent()");\r
127                 \r
128                 logger.log(logger.EXTREME, "Getting note content");\r
129                 Note n = conn.getNoteTable().getNote(guid,true,false,true,true, true);\r
130                 String data = n.getContent();\r
131                 \r
132                 logger.log(logger.EXTREME, "Removing any encrypted data");\r
133                 data = removeEnCrypt(data);\r
134                 logger.log(logger.EXTREME, "Removing xml markups");\r
135                 // These HTML characters need to be replaced by a space, or they'll cause words to jam together\r
136 //              data = data.toLowerCase().replace("<br>", " ").replace("<hr>", " ").replace("<p>", " ").replace("<href>", " ");\r
137 //              String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", ""));\r
138                 Tidy tidy = new Tidy();\r
139                 tidy.getStderr().close();  // the listener will capture messages\r
140                 tidy.setXmlTags(true);\r
141                 byte html[] = data.getBytes();\r
142                 ByteArrayInputStream is = new ByteArrayInputStream(html);\r
143                 ByteArrayOutputStream os = new ByteArrayOutputStream();\r
144                 tidy.parse(is, os);\r
145                 String text =  StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", "")) +" "+\r
146                 n.getTitle();\r
147                                 \r
148                 logger.log(logger.EXTREME, "Splitting words");\r
149                 String[] result = text.toString().split(regex);\r
150                 logger.log(logger.EXTREME, "Deleting existing words for note from index");\r
151                 conn.getWordsTable().expungeFromWordIndex(guid, "CONTENT");\r
152                 \r
153                 logger.log(logger.EXTREME, "Number of words found: " +result.length);\r
154                 for (int j=0; j<result.length && keepRunning; j++) {\r
155                         logger.log(logger.EXTREME, "Result word: " +result[j]);\r
156                         if (result[j].length() > 0) {\r
157                                 // We have a good word, now let's trim off junk at the beginning or end\r
158                                 StringBuffer buffer = new StringBuffer(result[j].toLowerCase());\r
159                                 for (int x = buffer.length()-1; x>=0; x--) {\r
160                                         if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
161                                                 buffer = buffer.deleteCharAt(x);\r
162                                         else\r
163                                                 x=-1;\r
164                                 }\r
165                                 // Things have been trimmed off the end, so reverse the string & repeat.\r
166                                 buffer = buffer.reverse();\r
167                                 for (int x = buffer.length()-1; x>=0; x--) {\r
168                                         if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
169                                                 buffer = buffer.deleteCharAt(x);\r
170                                         else\r
171                                                 x=-1;\r
172                                 }\r
173                                 // Restore the string back to the proper order.\r
174                                 buffer = buffer.reverse();\r
175                         \r
176                                 logger.log(logger.EXTREME, "Processing " +buffer);\r
177                                 if (buffer.length()>=Global.minimumWordCount) {\r
178                                         logger.log(logger.EXTREME, "Adding " +buffer);\r
179                                         conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), "CONTENT", 100);\r
180                                 }\r
181                         }\r
182                 }\r
183                 // If we were interrupted, we will reindex this note next time\r
184                 if (Global.keepRunning) {\r
185                         logger.log(logger.EXTREME, "Resetting note guid needed");\r
186                         conn.getNoteTable().setIndexNeeded(guid, false);\r
187                 }\r
188                 logger.log(logger.EXTREME, "Leaving indexRunner.indexNoteContent()");\r
189         }\r
190 \r
191         \r
192         public synchronized boolean addWork(String request) {\r
193                 if (workQueue.size() == 0) {\r
194                         workQueue.offer(request);\r
195                         return true;\r
196                 }\r
197                 return false;\r
198         }\r
199         \r
200         public synchronized int getWorkQueueSize() {\r
201                 return workQueue.size();\r
202         }\r
203         \r
204         public void indexResource() {\r
205                 \r
206                 if (guid == null)\r
207                         return;\r
208                 \r
209                 Resource r = conn.getNoteTable().noteResourceTable.getNoteResourceRecognition(guid);\r
210                 if (r == null || r.getRecognition() == null || r.getRecognition().getBody() == null || r.getRecognition().getBody().length == 0) \r
211                         resourceBinary = new QByteArray(" ");\r
212                 else\r
213                         resourceBinary = new QByteArray(r.getRecognition().getBody());\r
214                 \r
215                 conn.getWordsTable().expungeFromWordIndex(guid, "RESOURCE");\r
216                         \r
217                 doc.setContent(resourceBinary);\r
218                 QDomElement docElem = doc.documentElement();\r
219                         \r
220                 // look for text tags\r
221                 QDomNodeList anchors = docElem.elementsByTagName("t");\r
222                 for (int i=0; i<anchors.length() && keepRunning; i++) {\r
223                         QDomElement enmedia = anchors.at(i).toElement();\r
224                         String weight = new String(enmedia.attribute("w"));\r
225                         String text = new String(enmedia.text()).toLowerCase();\r
226                         if (!text.equals("")) {\r
227                                 conn.getWordsTable().addWordToNoteIndex(guid, text, "RESOURCE", new Integer(weight));\r
228                         }\r
229                 }\r
230                 if (Global.keepRunning)\r
231                         conn.getNoteTable().noteResourceTable.setIndexNeeded(guid,false);\r
232         }\r
233 \r
234         \r
235         private String removeEnCrypt(String content) {\r
236                 int index = content.indexOf("<en-crypt");\r
237                 int endPos;\r
238                 boolean tagFound = true;\r
239                 while (tagFound && keepRunning) {\r
240                         endPos = content.indexOf("</en-crypt>", index)+11;\r
241                         if (endPos > -1 && index > -1) {\r
242                                 content = content.substring(0,index)+content.substring(endPos);\r
243                                 index = content.indexOf("<en-crypt");\r
244                         } else {\r
245                                 tagFound = false;\r
246                         }\r
247                 }\r
248                 return content;\r
249         }\r
250 \r
251         \r
252         \r
253         \r
254 \r
255 }\r