2 * This file is part of NeverNote
\r
3 * Copyright 2009 Randy Baumgarte
\r
5 * This file may be licensed under the terms of the
\r
6 * GNU General Public License Version 2 (the ``GPL'').
\r
8 * Software distributed under the License is distributed
\r
9 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
\r
10 * express or implied. See the GPL for the specific language
\r
11 * governing rights and limitations.
\r
13 * You should have received a copy of the GPL along with this
\r
14 * program. If not, go to http://www.gnu.org/licenses/gpl.html
\r
15 * or write to the Free Software Foundation, Inc.,
\r
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
\r
20 package cx.fbn.nevernote.threads;
\r
22 import java.io.ByteArrayInputStream;
\r
23 import java.io.ByteArrayOutputStream;
\r
24 import java.util.concurrent.LinkedBlockingQueue;
\r
26 import org.apache.commons.lang.StringEscapeUtils;
\r
27 import org.w3c.tidy.Tidy;
\r
29 import com.evernote.edam.type.Note;
\r
30 import com.evernote.edam.type.Resource;
\r
31 import com.trolltech.qt.core.QByteArray;
\r
32 import com.trolltech.qt.core.QObject;
\r
33 import com.trolltech.qt.xml.QDomDocument;
\r
34 import com.trolltech.qt.xml.QDomElement;
\r
35 import com.trolltech.qt.xml.QDomNodeList;
\r
37 import cx.fbn.nevernote.Global;
\r
38 import cx.fbn.nevernote.signals.NoteResourceSignal;
\r
39 import cx.fbn.nevernote.signals.NoteSignal;
\r
40 import cx.fbn.nevernote.sql.DatabaseConnection;
\r
41 import cx.fbn.nevernote.utilities.ApplicationLogger;
\r
43 public class IndexRunner extends QObject implements Runnable {
\r
45 private final ApplicationLogger logger;
\r
46 private String guid;
\r
47 private QByteArray resourceBinary;
\r
48 public volatile NoteSignal noteSignal;
\r
49 public volatile NoteResourceSignal resourceSignal;
\r
50 private int indexType;
\r
51 public final int CONTENT=1;
\r
52 public final int RESOURCE=2;
\r
53 private boolean keepRunning;
\r
54 private final QDomDocument doc;
\r
55 private static String regex = Global.getWordRegex();
\r
56 private final DatabaseConnection conn;
\r
57 private volatile LinkedBlockingQueue<String> workQueue;
\r
58 private static int MAX_QUEUED_WAITING = 1000;
\r
/**
 * Creates an index runner with its own logger, database connection, signal
 * objects and bounded work queue. The runner starts out in CONTENT mode.
 *
 * @param logname name handed to the ApplicationLogger constructor
 * @param u       first connection argument, passed through to DatabaseConnection
 *                (presumably the database URL; confirm against DatabaseConnection)
 * @param uid     passed through to DatabaseConnection (presumably the user id)
 * @param pswd    passed through to DatabaseConnection (presumably the password)
 * @param cpswd   passed through to DatabaseConnection (presumably a cipher password)
 */
public IndexRunner(String logname, String u, String uid, String pswd, String cpswd) {
    // The logger must exist first because the connection is built around it.
    this.logger = new ApplicationLogger(logname);
    this.conn = new DatabaseConnection(this.logger, u, uid, pswd, cpswd);

    this.noteSignal = new NoteSignal();
    this.resourceSignal = new NoteResourceSignal();

    this.indexType = CONTENT;

    // NOTE(review): no statement visible here sets keepRunning, which defaults to
    // false; confirm it is switched on before run() starts looping.
    this.doc = new QDomDocument();
    this.workQueue = new LinkedBlockingQueue<String>(MAX_QUEUED_WAITING);
}
\r
75 public void setIndexType(int t) {
\r
82 thread().setPriority(Thread.MIN_PRIORITY);
\r
83 logger.log(logger.EXTREME, "Starting index thread ");
\r
84 while (keepRunning) {
\r
86 String work = workQueue.take();
\r
87 if (work.startsWith("CONTENT")) {
\r
88 work = work.replace("CONTENT ", "");
\r
90 indexType = CONTENT;
\r
92 if (work.startsWith("RESOURCE")) {
\r
93 work = work.replace("RESOURCE ", "");
\r
95 indexType = RESOURCE;
\r
97 if (work.startsWith("STOP")) {
\r
98 keepRunning = false;
\r
101 if (guid == null || guid.trim().equals("")) {
\r
103 resourceSignal.resourceIndexed.emit("null or empty guid");
\r
105 logger.log(logger.EXTREME, "Type:" +indexType);
\r
106 if (indexType == CONTENT && keepRunning) {
\r
107 logger.log(logger.MEDIUM, "Indexing note: "+guid);
\r
108 indexNoteContent();
\r
111 if (indexType == RESOURCE && keepRunning) {
\r
112 logger.log(logger.MEDIUM, "Indexing resource: "+guid);
\r
116 } catch (InterruptedException e) {
\r
117 // TODO Auto-generated catch block
\r
118 e.printStackTrace();
\r
125 public void indexNoteContent() {
\r
126 logger.log(logger.EXTREME, "Entering indexRunner.indexNoteContent()");
\r
128 logger.log(logger.EXTREME, "Getting note content");
\r
129 Note n = conn.getNoteTable().getNote(guid,true,false,true,true, true);
\r
130 String data = n.getContent();
\r
132 logger.log(logger.EXTREME, "Removing any encrypted data");
\r
133 data = removeEnCrypt(data);
\r
134 logger.log(logger.EXTREME, "Removing xml markups");
\r
135 // These HTML characters need to be replaced by a space, or they'll cause words to jam together
\r
136 // data = data.toLowerCase().replace("<br>", " ").replace("<hr>", " ").replace("<p>", " ").replace("<href>", " ");
\r
137 // String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", ""));
\r
138 Tidy tidy = new Tidy();
\r
139 tidy.getStderr().close(); // the listener will capture messages
\r
140 tidy.setXmlTags(true);
\r
141 byte html[] = data.getBytes();
\r
142 ByteArrayInputStream is = new ByteArrayInputStream(html);
\r
143 ByteArrayOutputStream os = new ByteArrayOutputStream();
\r
144 tidy.parse(is, os);
\r
145 String text = StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", ""));
\r
147 logger.log(logger.EXTREME, "Splitting words");
\r
148 String[] result = text.toString().split(regex);
\r
149 logger.log(logger.EXTREME, "Deleting existing words for note from index");
\r
150 conn.getWordsTable().expungeFromWordIndex(guid, "CONTENT");
\r
152 logger.log(logger.EXTREME, "Number of words found: " +result.length);
\r
153 for (int j=0; j<result.length && keepRunning; j++) {
\r
154 logger.log(logger.EXTREME, "Result word: " +result[j]);
\r
155 if (result[j].length() > 0) {
\r
156 // We have a good word, now let's trim off junk at the beginning or end
\r
157 StringBuffer buffer = new StringBuffer(result[j].toLowerCase());
\r
158 for (int x = buffer.length()-1; x>=0; x--) {
\r
159 if (!Character.isLetterOrDigit(buffer.charAt(x)))
\r
160 buffer = buffer.deleteCharAt(x);
\r
164 // Things have been trimmed off the end, so reverse the string & repeat.
\r
165 buffer = buffer.reverse();
\r
166 for (int x = buffer.length()-1; x>=0; x--) {
\r
167 if (!Character.isLetterOrDigit(buffer.charAt(x)))
\r
168 buffer = buffer.deleteCharAt(x);
\r
172 // Restore the string back to the proper order.
\r
173 buffer = buffer.reverse();
\r
175 logger.log(logger.EXTREME, "Processing " +buffer);
\r
176 if (buffer.length()>=Global.minimumWordCount) {
\r
177 logger.log(logger.EXTREME, "Adding " +buffer);
\r
178 conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), "CONTENT", 100);
\r
182 // If we were interrupted, we will reindex this note next time
\r
183 if (Global.keepRunning) {
\r
184 logger.log(logger.EXTREME, "Resetting note guid needed");
\r
185 conn.getNoteTable().setIndexNeeded(guid, false);
\r
187 logger.log(logger.EXTREME, "Leaving indexRunner.indexNoteContent()");
\r
191 public synchronized boolean addWork(String request) {
\r
192 if (workQueue.size() == 0) {
\r
193 workQueue.offer(request);
\r
199 public synchronized int getWorkQueueSize() {
\r
200 return workQueue.size();
\r
203 public void indexResource() {
\r
208 Resource r = conn.getNoteTable().noteResourceTable.getNoteResourceRecognition(guid);
\r
209 if (r == null || r.getRecognition() == null || r.getRecognition().getBody() == null || r.getRecognition().getBody().length == 0)
\r
210 resourceBinary = new QByteArray(" ");
\r
212 resourceBinary = new QByteArray(r.getRecognition().getBody());
\r
214 conn.getWordsTable().expungeFromWordIndex(guid, "RESOURCE");
\r
216 doc.setContent(resourceBinary);
\r
217 QDomElement docElem = doc.documentElement();
\r
219 // look for text tags
\r
220 QDomNodeList anchors = docElem.elementsByTagName("t");
\r
221 for (int i=0; i<anchors.length() && keepRunning; i++) {
\r
222 QDomElement enmedia = anchors.at(i).toElement();
\r
223 String weight = new String(enmedia.attribute("w"));
\r
224 String text = new String(enmedia.text()).toLowerCase();
\r
225 if (!text.equals("")) {
\r
226 conn.getWordsTable().addWordToNoteIndex(guid, text, "RESOURCE", new Integer(weight));
\r
229 if (Global.keepRunning)
\r
230 conn.getNoteTable().noteResourceTable.setIndexNeeded(guid,false);
\r
234 private String removeEnCrypt(String content) {
\r
235 int index = content.indexOf("<en-crypt");
\r
237 boolean tagFound = true;
\r
238 while (tagFound && keepRunning) {
\r
239 endPos = content.indexOf("</en-crypt>", index)+11;
\r
240 if (endPos > -1 && index > -1) {
\r
241 content = content.substring(0,index)+content.substring(endPos);
\r
242 index = content.indexOf("<en-crypt");
\r