OSDN Git Service

- Corrected various null pointer errors. - Corrected problem indexing large PDF docum...
[neighbornote/NeighborNote.git] / src / cx / fbn / nevernote / threads / IndexRunner.java
1 /*\r
2  * This file is part of NeverNote \r
3  * Copyright 2009 Randy Baumgarte\r
4  * \r
5  * This file may be licensed under the terms of the\r
6  * GNU General Public License Version 2 (the ``GPL'').\r
7  *\r
8  * Software distributed under the License is distributed\r
9  * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either\r
10  * express or implied. See the GPL for the specific language\r
11  * governing rights and limitations.\r
12  *\r
13  * You should have received a copy of the GPL along with this\r
14  * program. If not, go to http://www.gnu.org/licenses/gpl.html\r
15  * or write to the Free Software Foundation, Inc.,\r
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\r
17  *\r
18 */\r
19 \r
20 package cx.fbn.nevernote.threads;\r
21 \r
22 import java.io.ByteArrayInputStream;\r
23 import java.io.ByteArrayOutputStream;\r
24 import java.io.File;\r
25 import java.io.FileInputStream;\r
26 import java.io.FileNotFoundException;\r
27 import java.io.IOException;\r
28 import java.io.InputStream;\r
29 import java.util.concurrent.LinkedBlockingQueue;\r
30 \r
31 import org.apache.commons.lang.StringEscapeUtils;\r
32 import org.apache.tika.exception.TikaException;\r
33 import org.apache.tika.metadata.Metadata;\r
34 import org.apache.tika.parser.ParseContext;\r
35 import org.apache.tika.parser.microsoft.OfficeParser;\r
36 import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;\r
37 import org.apache.tika.parser.odf.OpenDocumentParser;\r
38 import org.apache.tika.parser.pdf.PDFParser;\r
39 import org.apache.tika.parser.rtf.RTFParser;\r
40 import org.apache.tika.sax.BodyContentHandler;\r
41 import org.w3c.tidy.Tidy;\r
42 import org.xml.sax.ContentHandler;\r
43 import org.xml.sax.SAXException;\r
44 \r
45 import com.evernote.edam.type.Data;\r
46 import com.evernote.edam.type.Note;\r
47 import com.evernote.edam.type.Resource;\r
48 import com.trolltech.qt.core.QByteArray;\r
49 import com.trolltech.qt.core.QIODevice.OpenModeFlag;\r
50 import com.trolltech.qt.core.QObject;\r
51 import com.trolltech.qt.core.QTemporaryFile;\r
52 import com.trolltech.qt.xml.QDomDocument;\r
53 import com.trolltech.qt.xml.QDomElement;\r
54 import com.trolltech.qt.xml.QDomNodeList;\r
55 \r
56 import cx.fbn.nevernote.Global;\r
57 import cx.fbn.nevernote.signals.NoteResourceSignal;\r
58 import cx.fbn.nevernote.signals.NoteSignal;\r
59 import cx.fbn.nevernote.sql.DatabaseConnection;\r
60 import cx.fbn.nevernote.utilities.ApplicationLogger;\r
61 \r
62 public class IndexRunner extends QObject implements Runnable {\r
63         \r
64         private final ApplicationLogger         logger;\r
65         private String                                          guid;\r
66         private QByteArray                                      resourceBinary;\r
67         public volatile NoteSignal                      noteSignal;\r
68         public volatile NoteResourceSignal      resourceSignal;\r
69         private int                                                     indexType;\r
70         public final int                                        CONTENT=1; \r
71         public final int                                        RESOURCE=2;\r
72         public boolean                                          keepRunning;\r
73         private final QDomDocument                      doc;\r
74         private static String                           regex = Global.getWordRegex();\r
75         private final DatabaseConnection        conn;\r
76         private volatile LinkedBlockingQueue<String> workQueue;\r
77         private static int MAX_QUEUED_WAITING = 1000;\r
78 \r
79         \r
80 \r
81         \r
82         public IndexRunner(String logname, String u, String uid, String pswd, String cpswd) {\r
83                 logger = new ApplicationLogger(logname);\r
84                 conn = new DatabaseConnection(logger, u, uid, pswd, cpswd);\r
85                 noteSignal = new NoteSignal();\r
86                 resourceSignal = new NoteResourceSignal();\r
87                 indexType = CONTENT;\r
88                 guid = null;\r
89                 keepRunning = true;\r
90                 doc = new QDomDocument();\r
91                 workQueue=new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING);  \r
92         }\r
93         \r
94         public void setIndexType(int t) {\r
95                 indexType = t;\r
96         }\r
97         \r
98         \r
99         @Override\r
100         public void run() {\r
101                 thread().setPriority(Thread.MIN_PRIORITY);\r
102                 logger.log(logger.EXTREME, "Starting index thread ");\r
103                 while (keepRunning) {\r
104                         try {\r
105                                 String work = workQueue.take();\r
106                                 if (work.startsWith("CONTENT")) {\r
107                                         work = work.replace("CONTENT ", "");\r
108                                         guid = work;\r
109                                         indexType = CONTENT;\r
110                                 }\r
111                                 if (work.startsWith("RESOURCE")) {\r
112                                         work = work.replace("RESOURCE ", "");\r
113                                         guid = work;\r
114                                         indexType = RESOURCE;\r
115                                 }\r
116                                 if (work.startsWith("STOP")) {\r
117                                         keepRunning = false;\r
118                                         guid = work;\r
119                                 }\r
120                                 if (guid == null || guid.trim().equals("")) {\r
121                                         setIndexType(0);\r
122                                         resourceSignal.resourceIndexed.emit("null or empty guid");\r
123                                 }\r
124                                 logger.log(logger.EXTREME, "Type:" +indexType);\r
125                                 if (indexType == CONTENT && keepRunning) {\r
126                                         logger.log(logger.MEDIUM, "Indexing note: "+guid);\r
127                                         indexNoteContent();\r
128                                         setIndexType(0);\r
129                                 }\r
130                                 if (indexType == RESOURCE && keepRunning) {\r
131                                         logger.log(logger.MEDIUM, "Indexing resource: "+guid);\r
132                                         indexResource();\r
133                                         setIndexType(0);\r
134                                 }\r
135                         } catch (InterruptedException e) {\r
136                                 // TODO Auto-generated catch block\r
137                                 e.printStackTrace();\r
138                         }\r
139                 }\r
140                 conn.dbShutdown();\r
141         }\r
142         \r
143         // Reindex a note\r
144         public void indexNoteContent() {\r
145                 logger.log(logger.EXTREME, "Entering indexRunner.indexNoteContent()");\r
146                 \r
147                 logger.log(logger.EXTREME, "Getting note content");\r
148                 Note n = conn.getNoteTable().getNote(guid,true,false,true,true, true);\r
149                 String data = n.getContent();\r
150                 \r
151                 logger.log(logger.EXTREME, "Removing any encrypted data");\r
152                 data = removeEnCrypt(data);\r
153                 logger.log(logger.EXTREME, "Removing xml markups");\r
154                 // These HTML characters need to be replaced by a space, or they'll cause words to jam together\r
155 //              data = data.toLowerCase().replace("<br>", " ").replace("<hr>", " ").replace("<p>", " ").replace("<href>", " ");\r
156 //              String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", ""));\r
157                 Tidy tidy = new Tidy();\r
158                 tidy.getStderr().close();  // the listener will capture messages\r
159                 tidy.setXmlTags(true);\r
160                 byte html[] = data.getBytes();\r
161                 ByteArrayInputStream is = new ByteArrayInputStream(html);\r
162                 ByteArrayOutputStream os = new ByteArrayOutputStream();\r
163                 tidy.parse(is, os);\r
164                 String text =  StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", "")) +" "+\r
165                 n.getTitle();\r
166                                 \r
167                 logger.log(logger.EXTREME, "Splitting words");\r
168                 String[] result = text.toString().split(regex);\r
169                 logger.log(logger.EXTREME, "Deleting existing words for note from index");\r
170                 conn.getWordsTable().expungeFromWordIndex(guid, "CONTENT");\r
171                 \r
172                 logger.log(logger.EXTREME, "Number of words found: " +result.length);\r
173                 for (int j=0; j<result.length && keepRunning; j++) {\r
174                         logger.log(logger.EXTREME, "Result word: " +result[j]);\r
175                         addToIndex(guid, result[j], "CONTENT");\r
176                 }\r
177                 // If we were interrupted, we will reindex this note next time\r
178                 if (Global.keepRunning) {\r
179                         logger.log(logger.EXTREME, "Resetting note guid needed");\r
180                         conn.getNoteTable().setIndexNeeded(guid, false);\r
181                 }\r
182                 logger.log(logger.EXTREME, "Leaving indexRunner.indexNoteContent()");\r
183         }\r
184 \r
185         \r
186         public synchronized boolean addWork(String request) {\r
187                 if (workQueue.size() == 0) {\r
188                         workQueue.offer(request);\r
189                         return true;\r
190                 }\r
191                 return false;\r
192         }\r
193         \r
194         public synchronized int getWorkQueueSize() {\r
195                 return workQueue.size();\r
196         }\r
197         \r
198         public void indexResource() {\r
199                 \r
200                 if (guid == null)\r
201                         return;\r
202                 \r
203                 Resource r = conn.getNoteTable().noteResourceTable.getNoteResourceRecognition(guid);\r
204                 if (r == null || r.getRecognition() == null || r.getRecognition().getBody() == null || r.getRecognition().getBody().length == 0) \r
205                         resourceBinary = new QByteArray(" ");\r
206                 else\r
207                         resourceBinary = new QByteArray(r.getRecognition().getBody());\r
208                 \r
209                 conn.getWordsTable().expungeFromWordIndex(r.getNoteGuid(), "RESOURCE");\r
210                 // This is due to an old bug & can be removed at some point in the future 11/23/2010\r
211                 conn.getWordsTable().expungeFromWordIndex(guid, "RESOURCE");   \r
212                         \r
213                 doc.setContent(resourceBinary);\r
214                 QDomElement docElem = doc.documentElement();\r
215                         \r
216                 // look for text tags\r
217                 QDomNodeList anchors = docElem.elementsByTagName("t");\r
218                 for (int i=0; i<anchors.length() && keepRunning; i++) {\r
219                         QDomElement enmedia = anchors.at(i).toElement();\r
220                         String weight = new String(enmedia.attribute("w"));\r
221                         String text = new String(enmedia.text()).toLowerCase();\r
222                         if (!text.equals("")) {\r
223                                 conn.getWordsTable().addWordToNoteIndex(r.getNoteGuid(), text, "RESOURCE", new Integer(weight));\r
224                         }\r
225                 }\r
226                 \r
227                 if (Global.keepRunning) {\r
228                         indexResourceContent(guid);\r
229                 }\r
230                 \r
231                 if (Global.keepRunning)\r
232                         conn.getNoteTable().noteResourceTable.setIndexNeeded(guid,false);\r
233         }\r
234         \r
235         private void indexResourceContent(String guid) {\r
236                 Resource r = conn.getNoteTable().noteResourceTable.getNoteResource(guid, true);\r
237                 if (r.getMime().equalsIgnoreCase("application/pdf")) {\r
238                         indexResourcePDF(r);\r
239                         return;\r
240                 }\r
241                 if (r.getMime().equalsIgnoreCase("application/docx") || \r
242                         r.getMime().equalsIgnoreCase("application/xlsx") || \r
243                         r.getMime().equalsIgnoreCase("application/pptx")) {\r
244                         indexResourceOOXML(r);\r
245                         return;\r
246                 }\r
247                 if (r.getMime().equalsIgnoreCase("application/vsd") ||\r
248                         r.getMime().equalsIgnoreCase("application/ppt") ||\r
249                         r.getMime().equalsIgnoreCase("application/xls") ||\r
250                         r.getMime().equalsIgnoreCase("application/msg") ||\r
251                         r.getMime().equalsIgnoreCase("application/doc")) {\r
252                                 indexResourceOffice(r);\r
253                                 return;\r
254                 }\r
255                 if (r.getMime().equalsIgnoreCase("application/rtf")) {\r
256                                         indexResourceRTF(r);\r
257                                         return;\r
258                 }\r
259                 if (r.getMime().equalsIgnoreCase("application/odf") ||\r
260                         r.getMime().equalsIgnoreCase("application/odt") ||\r
261                         r.getMime().equalsIgnoreCase("application/odp") ||\r
262                         r.getMime().equalsIgnoreCase("application/odg") ||\r
263                         r.getMime().equalsIgnoreCase("application/odb") ||\r
264                         r.getMime().equalsIgnoreCase("application/ods")) {\r
265                         indexResourceODF(r);\r
266                         return;\r
267                 }\r
268         }\r
269 \r
270 \r
271         private void indexResourceRTF(Resource r) {\r
272                 QTemporaryFile f = writeResource(r.getData());\r
273                 if (!keepRunning) {\r
274                         return;\r
275                 }\r
276                 \r
277                 InputStream input;\r
278                 try {\r
279                         input = new FileInputStream(new File(f.fileName()));\r
280                         ContentHandler textHandler = new BodyContentHandler(-1);\r
281                         Metadata metadata = new Metadata();\r
282                         RTFParser parser = new RTFParser();     \r
283                         ParseContext context = new ParseContext();\r
284                         parser.parse(input, textHandler, metadata, context);\r
285                         String[] result = textHandler.toString().split(regex);\r
286                         for (int i=0; i<result.length && keepRunning; i++) {\r
287                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
288                         }\r
289                         input.close();\r
290                 \r
291                         f.close();\r
292                 } catch (java.lang.ClassCastException e) {\r
293                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
294                 } catch (FileNotFoundException e) {\r
295                         // TODO Auto-generated catch block\r
296                         e.printStackTrace();\r
297                 } catch (IOException e) {\r
298                         // TODO Auto-generated catch block\r
299                         e.printStackTrace();\r
300                 } catch (SAXException e) {\r
301                         // TODO Auto-generated catch block\r
302                         e.printStackTrace();\r
303                 } catch (TikaException e) {\r
304                         // TODO Auto-generated catch block\r
305                         e.printStackTrace();\r
306                 }\r
307         }\r
308 \r
309         \r
310         private void indexResourceODF(Resource r) {\r
311                 QTemporaryFile f = writeResource(r.getData());\r
312                 if (!keepRunning) {\r
313                         return;\r
314                 }\r
315                 \r
316                 InputStream input;\r
317                 try {\r
318                         input = new FileInputStream(new File(f.fileName()));\r
319                         ContentHandler textHandler = new BodyContentHandler(-1);\r
320                         Metadata metadata = new Metadata();\r
321                         OpenDocumentParser parser = new OpenDocumentParser();   \r
322                         ParseContext context = new ParseContext();\r
323                         parser.parse(input, textHandler, metadata, context);\r
324                         String[] result = textHandler.toString().split(regex);\r
325                         for (int i=0; i<result.length && keepRunning; i++) {\r
326                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
327                         }\r
328                         input.close();\r
329                 \r
330                         f.close();\r
331                 } catch (java.lang.ClassCastException e) {\r
332                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
333                 } catch (FileNotFoundException e) {\r
334                         // TODO Auto-generated catch block\r
335                         e.printStackTrace();\r
336                 } catch (IOException e) {\r
337                         // TODO Auto-generated catch block\r
338                         e.printStackTrace();\r
339                 } catch (SAXException e) {\r
340                         // TODO Auto-generated catch block\r
341                         e.printStackTrace();\r
342                 } catch (TikaException e) {\r
343                         // TODO Auto-generated catch block\r
344                         e.printStackTrace();\r
345                 }\r
346         }\r
347 \r
348         \r
349         private void indexResourceOffice(Resource r) {\r
350                 QTemporaryFile f = writeResource(r.getData());\r
351                 if (!keepRunning) {\r
352                         return;\r
353                 }\r
354                 \r
355                 InputStream input;\r
356                 try {\r
357                         input = new FileInputStream(new File(f.fileName()));\r
358                         ContentHandler textHandler = new BodyContentHandler(-1);\r
359                         Metadata metadata = new Metadata();\r
360                         OfficeParser parser = new OfficeParser();       \r
361                         ParseContext context = new ParseContext();\r
362                         parser.parse(input, textHandler, metadata, context);\r
363                         String[] result = textHandler.toString().split(regex);\r
364                         for (int i=0; i<result.length && keepRunning; i++) {\r
365                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
366                         }\r
367                         input.close();\r
368                 \r
369                         f.close();\r
370                 } catch (java.lang.ClassCastException e) {\r
371                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
372                 } catch (FileNotFoundException e) {\r
373                         // TODO Auto-generated catch block\r
374                         e.printStackTrace();\r
375                 } catch (IOException e) {\r
376                         // TODO Auto-generated catch block\r
377                         e.printStackTrace();\r
378                 } catch (SAXException e) {\r
379                         // TODO Auto-generated catch block\r
380                         e.printStackTrace();\r
381                 } catch (TikaException e) {\r
382                         // TODO Auto-generated catch block\r
383                         e.printStackTrace();\r
384                 }\r
385         }\r
386 \r
387         \r
388         \r
389         private void indexResourcePDF(Resource r) {\r
390                 QTemporaryFile f = writeResource(r.getData());\r
391                 if (!keepRunning) {\r
392                         return;\r
393                 }\r
394                 \r
395                 InputStream input;\r
396                 try {\r
397                         input = new FileInputStream(new File(f.fileName()));\r
398                         ContentHandler textHandler = new BodyContentHandler(-1);\r
399                         Metadata metadata = new Metadata();\r
400                         PDFParser parser = new PDFParser();     \r
401                         ParseContext context = new ParseContext();\r
402                         parser.parse(input, textHandler, metadata, context);\r
403                         String[] result = textHandler.toString().split(regex);\r
404                         for (int i=0; i<result.length && keepRunning; i++) {\r
405                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
406                         }\r
407                         input.close();\r
408                 \r
409                         f.close();\r
410                 } catch (java.lang.ClassCastException e) {\r
411                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
412                 } catch (FileNotFoundException e) {\r
413                         e.printStackTrace();\r
414                 } catch (IOException e) {\r
415                         e.printStackTrace();\r
416                 } catch (SAXException e) {\r
417                         e.printStackTrace();\r
418                 } catch (TikaException e) {\r
419                         e.printStackTrace();\r
420 //              } catch (Exception e) {\r
421 //                      e.printStackTrace();\r
422                 }\r
423         }\r
424         \r
425         \r
426         private void indexResourceOOXML(Resource r) {\r
427                 QTemporaryFile f = writeResource(r.getData());\r
428                 if (!keepRunning) {\r
429                         return;\r
430                 }\r
431                 \r
432                 InputStream input;\r
433                 try {\r
434                         input = new FileInputStream(new File(f.fileName()));\r
435                         ContentHandler textHandler = new BodyContentHandler(-1);\r
436                         Metadata metadata = new Metadata();\r
437                         OOXMLParser parser = new OOXMLParser(); \r
438                         ParseContext context = new ParseContext();\r
439                         parser.parse(input, textHandler, metadata, context);\r
440                         String[] result = textHandler.toString().split(regex);\r
441                         for (int i=0; i<result.length && keepRunning; i++) {\r
442                                 addToIndex(r.getNoteGuid(), result[i], "RESOURCE");\r
443                         }\r
444                         input.close();\r
445                 \r
446                         f.close();\r
447                 } catch (java.lang.ClassCastException e) {\r
448                         logger.log(logger.LOW, "Cast exception: " +e.getMessage());\r
449                 } catch (FileNotFoundException e) {\r
450                         // TODO Auto-generated catch block\r
451                         e.printStackTrace();\r
452                 } catch (IOException e) {\r
453                         // TODO Auto-generated catch block\r
454                         e.printStackTrace();\r
455                 } catch (SAXException e) {\r
456                         // TODO Auto-generated catch block\r
457                         e.printStackTrace();\r
458                 } catch (TikaException e) {\r
459                         // TODO Auto-generated catch block\r
460                         e.printStackTrace();\r
461                 }\r
462         }\r
463         \r
464 \r
465         \r
466         private QTemporaryFile writeResource(Data d) {\r
467                 QTemporaryFile newFile = new QTemporaryFile();\r
468                 newFile.open(OpenModeFlag.WriteOnly);\r
469                 newFile.write(d.getBody());\r
470                 newFile.close();\r
471                 return newFile;\r
472         }\r
473 \r
474         \r
475         private String removeEnCrypt(String content) {\r
476                 int index = content.indexOf("<en-crypt");\r
477                 int endPos;\r
478                 boolean tagFound = true;\r
479                 while (tagFound && keepRunning) {\r
480                         endPos = content.indexOf("</en-crypt>", index)+11;\r
481                         if (endPos > -1 && index > -1) {\r
482                                 content = content.substring(0,index)+content.substring(endPos);\r
483                                 index = content.indexOf("<en-crypt");\r
484                         } else {\r
485                                 tagFound = false;\r
486                         }\r
487                 }\r
488                 return content;\r
489         }\r
490 \r
491         \r
492         private void addToIndex(String guid, String word, String type) {\r
493                 if (word.length() > 0) {\r
494                         // We have a good word, now let's trim off junk at the beginning or end\r
495                         StringBuffer buffer = new StringBuffer(word.toLowerCase());\r
496                         for (int x = buffer.length()-1; x>=0; x--) {\r
497                                 if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
498                                         buffer = buffer.deleteCharAt(x);\r
499                                 else\r
500                                         x=-1;\r
501                         }\r
502                         // Things have been trimmed off the end, so reverse the string & repeat.\r
503                         buffer = buffer.reverse();\r
504                         for (int x = buffer.length()-1; x>=0; x--) {\r
505                                 if (!Character.isLetterOrDigit(buffer.charAt(x)))\r
506                                         buffer = buffer.deleteCharAt(x);\r
507                                 else\r
508                                         x=-1;\r
509                         }\r
510                         // Restore the string back to the proper order.\r
511                         buffer = buffer.reverse();\r
512                 \r
513                         logger.log(logger.EXTREME, "Processing " +buffer);\r
514                         if (buffer.length()>=Global.minimumWordCount) {\r
515                                 logger.log(logger.EXTREME, "Adding " +buffer);\r
516                                 conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), type, 100);\r
517                         }\r
518                 }\r
519         }\r
520         \r
521 \r
522 }\r