OSDN Git Service

Fix bug where resources were indexed under the resource GUID rather than the note...
[neighbornote/NeighborNote.git] / src / cx / fbn / nevernote / threads / IndexRunner.java
1 /*\r
2  * This file is part of NeverNote \r
3  * Copyright 2009 Randy Baumgarte\r
4  * \r
5  * This file may be licensed under the terms of the\r
6  * GNU General Public License Version 2 (the ``GPL'').\r
7  *\r
8  * Software distributed under the License is distributed\r
9  * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either\r
10  * express or implied. See the GPL for the specific language\r
11  * governing rights and limitations.\r
12  *\r
13  * You should have received a copy of the GPL along with this\r
14  * program. If not, go to http://www.gnu.org/licenses/gpl.html\r
15  * or write to the Free Software Foundation, Inc.,\r
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\r
17  *\r
18 */\r
19 \r
20 package cx.fbn.nevernote.threads;\r
21 \r
22 import java.io.ByteArrayInputStream;\r
23 import java.io.ByteArrayOutputStream;\r
24 import java.io.File;\r
25 import java.io.FileInputStream;\r
26 import java.io.FileNotFoundException;\r
27 import java.io.IOException;\r
28 import java.io.InputStream;\r
29 import java.util.concurrent.LinkedBlockingQueue;\r
30 \r
31 import org.apache.commons.lang.StringEscapeUtils;\r
32 import org.apache.tika.exception.TikaException;\r
33 import org.apache.tika.metadata.Metadata;\r
34 import org.apache.tika.parser.ParseContext;\r
35 import org.apache.tika.parser.microsoft.OfficeParser;\r
36 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;\r
37 import org.apache.tika.parser.odf.OpenDocumentContentParser;\r
38 import org.apache.tika.parser.pdf.PDFParser;\r
39 import org.apache.tika.parser.rtf.RTFParser;\r
40 import org.apache.tika.sax.BodyContentHandler;\r
41 import org.w3c.tidy.Tidy;\r
42 import org.xml.sax.ContentHandler;\r
43 import org.xml.sax.SAXException;\r
44 \r
45 import com.evernote.edam.type.Data;\r
46 import com.evernote.edam.type.Note;\r
47 import com.evernote.edam.type.Resource;\r
48 import com.trolltech.qt.core.QByteArray;\r
49 import com.trolltech.qt.core.QIODevice.OpenModeFlag;\r
50 import com.trolltech.qt.core.QObject;\r
51 import com.trolltech.qt.core.QTemporaryFile;\r
52 import com.trolltech.qt.xml.QDomDocument;\r
53 import com.trolltech.qt.xml.QDomElement;\r
54 import com.trolltech.qt.xml.QDomNodeList;\r
55 \r
56 import cx.fbn.nevernote.Global;\r
57 import cx.fbn.nevernote.signals.NoteResourceSignal;\r
58 import cx.fbn.nevernote.signals.NoteSignal;\r
59 import cx.fbn.nevernote.sql.DatabaseConnection;\r
60 import cx.fbn.nevernote.utilities.ApplicationLogger;\r
61 \r
62 public class IndexRunner extends QObject implements Runnable {\r
63         \r
64         private final ApplicationLogger         logger;\r
65         private String                                          guid;\r
66         private QByteArray                                      resourceBinary;\r
67         public volatile NoteSignal                      noteSignal;\r
68         public volatile NoteResourceSignal      resourceSignal;\r
69         private int                                                     indexType;\r
70         public final int                                        CONTENT=1; \r
71         public final int                                        RESOURCE=2;\r
72         public boolean                                          keepRunning;\r
73         private final QDomDocument                      doc;\r
74         private static String                           regex = Global.getWordRegex();\r
75         private final DatabaseConnection        conn;\r
76         private volatile LinkedBlockingQueue<String> workQueue;\r
77         private static int MAX_QUEUED_WAITING = 1000;\r
78 \r
79         \r
80 \r
81         \r
82         public IndexRunner(String logname, String u, String uid, String pswd, String cpswd) {\r
83                 logger = new ApplicationLogger(logname);\r
84                 conn = new DatabaseConnection(logger, u, uid, pswd, cpswd);\r
85                 noteSignal = new NoteSignal();\r
86                 resourceSignal = new NoteResourceSignal();\r
87                 indexType = CONTENT;\r
88                 guid = null;\r
89                 keepRunning = true;\r
90                 doc = new QDomDocument();\r
91                 workQueue=new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING);  \r
92         }\r
93         \r
94         public void setIndexType(int t) {\r
95                 indexType = t;\r
96         }\r
97         \r
98         \r
99         @Override\r
100         public void run() {\r
101                 thread().setPriority(Thread.MIN_PRIORITY);\r
102                 logger.log(logger.EXTREME, "Starting index thread ");\r
103                 while (keepRunning) {\r
104                         try {\r
105                                 String work = workQueue.take();\r
106                                 if (work.startsWith("CONTENT")) {\r
107                                         work = work.replace("CONTENT ", "");\r
108                                         guid = work;\r
109                                         indexType = CONTENT;\r
110                                 }\r
111                                 if (work.startsWith("RESOURCE")) {\r
112                                         work = work.replace("RESOURCE ", "");\r
113                                         guid = work;\r
114                                         indexType = RESOURCE;\r
115                                 }\r
116                                 if (work.startsWith("STOP")) {\r
117                                         keepRunning = false;\r
118                                         guid = work;\r
119                                 }\r
120                                 if (guid == null || guid.trim().equals("")) {\r
121                                         setIndexType(0);\r
122                                         resourceSignal.resourceIndexed.emit("null or empty guid");\r
123                                 }\r
124                                 logger.log(logger.EXTREME, "Type:" +indexType);\r
125                                 if (indexType == CONTENT && keepRunning) {\r
126                                         logger.log(logger.MEDIUM, "Indexing note: "+guid);\r
127                                         indexNoteContent();\r
128                                         setIndexType(0);\r
129                                 }\r
130                                 if (indexType == RESOURCE && keepRunning) {\r
131                                         logger.log(logger.MEDIUM, "Indexing resource: "+guid);\r
132                                         indexResource();\r
133                                         setIndexType(0);\r
134                                 }\r
135                         } catch (InterruptedException e) {\r
136                                 // TODO Auto-generated catch block\r
137                                 e.printStackTrace();\r
138                         }\r
139                 }\r
140                 conn.dbShutdown();\r
141         }\r
142         \r
143         // Reindex a note\r
144         public void indexNoteContent() {\r
145                 logger.log(logger.EXTREME, "Entering indexRunner.indexNoteContent()");\r
146                 \r
147                 logger.log(logger.EXTREME, "Getting note content");\r
148                 Note n = conn.getNoteTable().getNote(guid,true,false,true,true, true);\r
149                 String data = n.getContent();\r
150                 \r
151                 logger.log(logger.EXTREME, "Removing any encrypted data");\r
152                 data = removeEnCrypt(data);\r
153                 logger.log(logger.EXTREME, "Removing xml markups");\r
154                 // These HTML characters need to be replaced by a space, or they'll cause words to jam together\r
155 //              data = data.toLowerCase().replace("<br>", " ").replace("<hr>", " ").replace("<p>", " ").replace("<href>", " ");\r
156 //              String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", ""));\r
157                 Tidy tidy = new Tidy();\r
158                 tidy.getStderr().close();  // the listener will capture messages\r
159                 tidy.setXmlTags(true);\r
160                 byte html[] = data.getBytes();\r
161                 ByteArrayInputStream is = new ByteArrayInputStream(html);\r
162                 ByteArrayOutputStream os = new ByteArrayOutputStream();\r
163                 tidy.parse(is, os);\r
164                 String text =  StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", "")) +" "+\r
165                 n.getTitle();\r
166                                 \r
167                 logger.log(logger.EXTREME, "Splitting words");\r
168                 String[] result = text.toString().split(regex);\r
169                 logger.log(logger.EXTREME, "Deleting existing words for note from index");\r
170                 conn.getWordsTable().expungeFromWordIndex(guid, "CONTENT");\r
171                 \r
172                 logger.log(logger.EXTREME, "Number of words found: " +result.length);\r
173                 for (int j=0; j<result.length && keepRunning; j++) {\r
174                         logger.log(logger.EXTREME, "Result word: " +result[j]);\r
175                         addToIndex(guid, result[j], "CONTENT");\r
176                 }\r
177                 // If we were interrupted, we will reindex this note next time\r
178                 if (Global.keepRunning) {\r
179                         logger.log(logger.EXTREME, "Resetting note guid needed");\r
180                         conn.getNoteTable().setIndexNeeded(guid, false);\r
181                 }\r
182                 logger.log(logger.EXTREME, "Leaving indexRunner.indexNoteContent()");\r
183         }\r
184 \r
185         \r
186         public synchronized boolean addWork(String request) {\r
187                 if (workQueue.size() == 0) {\r
188                         workQueue.offer(request);\r
189                         return true;\r
190                 }\r
191                 return false;\r
192         }\r
193         \r
194         public synchronized int getWorkQueueSize() {\r
195                 return workQueue.size();\r
196         }\r
197         \r
198         public void indexResource() {\r
199                 \r
200                 if (guid == null)\r
201                         return;\r
202                 \r
203                 Resource r = conn.getNoteTable().noteResourceTable.getNoteResourceRecognition(guid);\r
204                 if (r == null || r.getRecognition() == null || r.getRecognition().getBody() == null || r.getRecognition().getBody().length == 0) \r
205                         resourceBinary = new QByteArray(" ");\r
206                 else\r
207                         resourceBinary = new QByteArray(r.getRecognition().getBody());\r
208                 \r
209                 conn.getWordsTable().expungeFromWordIndex(r.getNoteGuid(), "RESOURCE");\r
210                 // This is due to an old bug & can be removed at some point in the future 11/23/2010\r
211                 conn.getWordsTable().expungeFromWordIndex(guid, "RESOURCE");   \r
212                         \r
213                 doc.setContent(resourceBinary);\r
214                 QDomElement docElem = doc.documentElement();\r
215                         \r
216                 // look for text tags\r
217                 QDomNodeList anchors = docElem.elementsByTagName("t");\r
218                 for (int i=0; i<anchors.length() && keepRunning; i++) {\r
219                         QDomElement enmedia = anchors.at(i).toElement();\r
220                         String weight = new String(enmedia.attribute("w"));\r
221                         String text = new String(enmedia.text()).toLowerCase();\r
222                         if (!text.equals("")) {\r
223                                 conn.getWordsTable().addWordToNoteIndex(r.getNoteGuid(), text, "RESOURCE", new Integer(weight));\r
224                         }\r
225                 }\r
226                 \r
227                 if (Global.keepRunning) {\r
228                         indexResourceContent(guid);\r
229                 }\r
230                 \r
231                 if (Global.keepRunning)\r
232                         conn.getNoteTable().noteResourceTable.setIndexNeeded(guid,false);\r
233         }\r
234         \r
235         private void indexResourceContent(String guid) {\r
236                 Resource r = conn.getNoteTable().noteResourceTable.getNoteResource(guid, true);\r
237                 if (r.getMime().equalsIgnoreCase("application/pdf")) {\r
238                         indexResourcePDF(r);\r
239                         return;\r
240                 }\r
241                 if (r.getMime().equalsIgnoreCase("application/docx") || \r
242                         r.getMime().equalsIgnoreCase("application/xlsx") || \r
243                         r.getMime().equalsIgnoreCase("application/pptx")) {\r
244                         indexResourceOOXML(r);\r
245                         return;\r
246                 }\r
247                 if (r.getMime().equalsIgnoreCase("application/vsd") ||\r
248                         r.getMime().equalsIgnoreCase("application/ppt") ||\r
249                         r.getMime().equalsIgnoreCase("application/xls") ||\r
250                         r.getMime().equalsIgnoreCase("application/msg") ||\r
251                         r.getMime().equalsIgnoreCase("application/doc")) {\r
252                                 indexResourceOffice(r);\r
253                                 return;\r
254                 }\r
255                 if (r.getMime().equalsIgnoreCase("application/rtf")) {\r
256                                         indexResourceRTF(r);\r
257                                         return;\r
258                 }\r
259                 if (r.getMime().equalsIgnoreCase("application/odf")) {\r
260                         indexResourceODF(r);\r
261                         return;\r
262                 }\r
263         }\r
264 \r
265 \r
266         private void indexResourceRTF(Resource r) {\r
267                 QTemporaryFile f = writeResource(r.getData());\r
268                 if (!keepRunning) {\r
269                         return;\r
270                 }\r
271                 \r
272                 InputStream input;\r
273                 try {\r
274                         input = new FileInputStream(new File(f.fileName()));\r
275                         ContentHandler textHandler = new BodyContentHandler();\r
276                         Metadata metadata = new Metadata();\r
277                         RTFParser parser = new RTFParser();     \r
278                         ParseContext context = new ParseContext();\r
279                         parser.parse(input, textHandler, metadata, context);\r
280                         String[] result = textHandler.toString().split(regex);\r
281                         for (int i=0; i<result.length && keepRunning; i++) {\r
282                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
283                         }\r
284                         input.close();\r
285                 \r
286                         f.close();\r
287                 } catch (java.lang.ClassCastException e) {\r
288                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
289                 } catch (FileNotFoundException e) {\r
290                         // TODO Auto-generated catch block\r
291                         e.printStackTrace();\r
292                 } catch (IOException e) {\r
293                         // TODO Auto-generated catch block\r
294                         e.printStackTrace();\r
295                 } catch (SAXException e) {\r
296                         // TODO Auto-generated catch block\r
297                         e.printStackTrace();\r
298                 } catch (TikaException e) {\r
299                         // TODO Auto-generated catch block\r
300                         e.printStackTrace();\r
301                 }\r
302         }\r
303 \r
304         \r
305         private void indexResourceODF(Resource r) {\r
306                 QTemporaryFile f = writeResource(r.getData());\r
307                 if (!keepRunning) {\r
308                         return;\r
309                 }\r
310                 \r
311                 InputStream input;\r
312                 try {\r
313                         input = new FileInputStream(new File(f.fileName()));\r
314                         ContentHandler textHandler = new BodyContentHandler();\r
315                         Metadata metadata = new Metadata();\r
316                         OpenDocumentContentParser parser = new OpenDocumentContentParser();     \r
317                         ParseContext context = new ParseContext();\r
318                         parser.parse(input, textHandler, metadata, context);\r
319                         String[] result = textHandler.toString().split(regex);\r
320                         for (int i=0; i<result.length && keepRunning; i++) {\r
321                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
322                         }\r
323                         input.close();\r
324                 \r
325                         f.close();\r
326                 } catch (java.lang.ClassCastException e) {\r
327                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
328                 } catch (FileNotFoundException e) {\r
329                         // TODO Auto-generated catch block\r
330                         e.printStackTrace();\r
331                 } catch (IOException e) {\r
332                         // TODO Auto-generated catch block\r
333                         e.printStackTrace();\r
334                 } catch (SAXException e) {\r
335                         // TODO Auto-generated catch block\r
336                         e.printStackTrace();\r
337                 } catch (TikaException e) {\r
338                         // TODO Auto-generated catch block\r
339                         e.printStackTrace();\r
340                 }\r
341         }\r
342 \r
343         \r
344         private void indexResourceOffice(Resource r) {\r
345                 QTemporaryFile f = writeResource(r.getData());\r
346                 if (!keepRunning) {\r
347                         return;\r
348                 }\r
349                 \r
350                 InputStream input;\r
351                 try {\r
352                         input = new FileInputStream(new File(f.fileName()));\r
353                         ContentHandler textHandler = new BodyContentHandler();\r
354                         Metadata metadata = new Metadata();\r
355                         OfficeParser parser = new OfficeParser();       \r
356                         ParseContext context = new ParseContext();\r
357                         parser.parse(input, textHandler, metadata, context);\r
358                         String[] result = textHandler.toString().split(regex);\r
359                         for (int i=0; i<result.length && keepRunning; i++) {\r
360                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
361                         }\r
362                         input.close();\r
363                 \r
364                         f.close();\r
365                 } catch (java.lang.ClassCastException e) {\r
366                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
367                 } catch (FileNotFoundException e) {\r
368                         // TODO Auto-generated catch block\r
369                         e.printStackTrace();\r
370                 } catch (IOException e) {\r
371                         // TODO Auto-generated catch block\r
372                         e.printStackTrace();\r
373                 } catch (SAXException e) {\r
374                         // TODO Auto-generated catch block\r
375                         e.printStackTrace();\r
376                 } catch (TikaException e) {\r
377                         // TODO Auto-generated catch block\r
378                         e.printStackTrace();\r
379                 }\r
380         }\r
381 \r
382         \r
383         \r
384         private void indexResourcePDF(Resource r) {\r
385                 QTemporaryFile f = writeResource(r.getData());\r
386                 if (!keepRunning) {\r
387                         return;\r
388                 }\r
389                 \r
390                 InputStream input;\r
391                 try {\r
392                         input = new FileInputStream(new File(f.fileName()));\r
393                         ContentHandler textHandler = new BodyContentHandler();\r
394                         Metadata metadata = new Metadata();\r
395                         PDFParser parser = new PDFParser();     \r
396                         ParseContext context = new ParseContext();\r
397                         parser.parse(input, textHandler, metadata, context);\r
398                         String[] result = textHandler.toString().split(regex);\r
399                         for (int i=0; i<result.length && keepRunning; i++) {\r
400                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
401                         }\r
402                         input.close();\r
403                 \r
404                         f.close();\r
405                 } catch (java.lang.ClassCastException e) {\r
406                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
407                 } catch (FileNotFoundException e) {\r
408                         // TODO Auto-generated catch block\r
409                         e.printStackTrace();\r
410                 } catch (IOException e) {\r
411                         // TODO Auto-generated catch block\r
412                         e.printStackTrace();\r
413                 } catch (SAXException e) {\r
414                         // TODO Auto-generated catch block\r
415                         e.printStackTrace();\r
416                 } catch (TikaException e) {\r
417                         // TODO Auto-generated catch block\r
418                         e.printStackTrace();\r
419                 }\r
420         }\r
421         \r
422         \r
423         private void indexResourceOOXML(Resource r) {\r
424                 QTemporaryFile f = writeResource(r.getData());\r
425                 if (!keepRunning) {\r
426                         return;\r
427                 }\r
428                 \r
429                 InputStream input;\r
430                 try {\r
431                         input = new FileInputStream(new File(f.fileName()));\r
432                         ContentHandler textHandler = new BodyContentHandler();\r
433                         Metadata metadata = new Metadata();\r
434                         OOXMLParser parser = new OOXMLParser(); \r
435                         ParseContext context = new ParseContext();\r
436                         parser.parse(input, textHandler, metadata, context);\r
437                         String[] result = textHandler.toString().split(regex);\r
438                         for (int i=0; i<result.length && keepRunning; i++) {\r
439                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
440                         }\r
441                         input.close();\r
442                 \r
443                         f.close();\r
444                 } catch (java.lang.ClassCastException e) {\r
445                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
446                 } catch (FileNotFoundException e) {\r
447                         // TODO Auto-generated catch block\r
448                         e.printStackTrace();\r
449                 } catch (IOException e) {\r
450                         // TODO Auto-generated catch block\r
451                         e.printStackTrace();\r
452                 } catch (SAXException e) {\r
453                         // TODO Auto-generated catch block\r
454                         e.printStackTrace();\r
455                 } catch (TikaException e) {\r
456                         // TODO Auto-generated catch block\r
457                         e.printStackTrace();\r
458                 }\r
459         }\r
460         \r
461 \r
462         \r
463         private QTemporaryFile writeResource(Data d) {\r
464                 QTemporaryFile newFile = new QTemporaryFile();\r
465                 newFile.open(OpenModeFlag.WriteOnly);\r
466                 newFile.write(d.getBody());\r
467                 newFile.close();\r
468                 return newFile;\r
469         }\r
470 \r
471         \r
472         private String removeEnCrypt(String content) {\r
473                 int index = content.indexOf("<en-crypt");\r
474                 int endPos;\r
475                 boolean tagFound = true;\r
476                 while (tagFound && keepRunning) {\r
477                         endPos = content.indexOf("</en-crypt>", index)+11;\r
478                         if (endPos > -1 && index > -1) {\r
479                                 content = content.substring(0,index)+content.substring(endPos);\r
480                                 index = content.indexOf("<en-crypt");\r
481                         } else {\r
482                                 tagFound = false;\r
483                         }\r
484                 }\r
485                 return content;\r
486         }\r
487 \r
488         \r
489         private void addToIndex(String guid, String word, String type) {\r
490                 if (word.length() > 0) {\r
491                         // We have a good word, now let's trim off junk at the beginning or end\r
492                         StringBuffer buffer = new StringBuffer(word.toLowerCase());\r
493                         for (int x = buffer.length()-1; x>=0; x--) {\r
494                                 if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
495                                         buffer = buffer.deleteCharAt(x);\r
496                                 else\r
497                                         x=-1;\r
498                         }\r
499                         // Things have been trimmed off the end, so reverse the string & repeat.\r
500                         buffer = buffer.reverse();\r
501                         for (int x = buffer.length()-1; x>=0; x--) {\r
502                                 if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
503                                         buffer = buffer.deleteCharAt(x);\r
504                                 else\r
505                                         x=-1;\r
506                         }\r
507                         // Restore the string back to the proper order.\r
508                         buffer = buffer.reverse();\r
509                 \r
510                         logger.log(logger.EXTREME, "Processing " +buffer);\r
511                         if (buffer.length()>=Global.minimumWordCount) {\r
512                                 logger.log(logger.EXTREME, "Adding " +buffer);\r
513                                 conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), type, 100);\r
514                         }\r
515                 }\r
516         }\r
517         \r
518 \r
519 }\r