/* * This file is part of NeverNote * Copyright 2009 Randy Baumgarte * * This file may be licensed under the terms of of the * GNU General Public License Version 2 (the ``GPL''). * * Software distributed under the License is distributed * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See the GPL for the specific language * governing rights and limitations. * * You should have received a copy of the GPL along with this * program. If not, go to http://www.gnu.org/licenses/gpl.html * or write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * */ package cx.fbn.nevernote.threads; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.concurrent.LinkedBlockingQueue; import org.apache.commons.lang.StringEscapeUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; import org.apache.tika.parser.odf.OpenDocumentParser; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.parser.rtf.RTFParser; import org.apache.tika.sax.BodyContentHandler; import org.w3c.tidy.Tidy; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import com.evernote.edam.type.Data; import com.evernote.edam.type.Note; import com.evernote.edam.type.Resource; import com.trolltech.qt.core.QByteArray; import com.trolltech.qt.core.QIODevice.OpenModeFlag; import com.trolltech.qt.core.QObject; import com.trolltech.qt.core.QTemporaryFile; import com.trolltech.qt.xml.QDomDocument; import com.trolltech.qt.xml.QDomElement; import com.trolltech.qt.xml.QDomNodeList; import cx.fbn.nevernote.Global; import cx.fbn.nevernote.signals.NoteResourceSignal; import cx.fbn.nevernote.signals.NoteSignal; import cx.fbn.nevernote.sql.DatabaseConnection; import cx.fbn.nevernote.utilities.ApplicationLogger; public class IndexRunner extends QObject implements Runnable { private final ApplicationLogger logger; private String guid; private QByteArray resourceBinary; public volatile NoteSignal noteSignal; public volatile NoteResourceSignal resourceSignal; private int indexType; public final int CONTENT=1; public final int RESOURCE=2; public boolean keepRunning; private final QDomDocument doc; private static String regex = Global.getWordRegex(); private final DatabaseConnection conn; private volatile LinkedBlockingQueue workQueue; private static int MAX_QUEUED_WAITING = 1000; public IndexRunner(String logname, String u, String uid, String pswd, String cpswd) { logger = new ApplicationLogger(logname); conn = new DatabaseConnection(logger, u, uid, pswd, cpswd); noteSignal = new NoteSignal(); resourceSignal = new NoteResourceSignal(); indexType = CONTENT; guid = null; keepRunning = true; doc = new QDomDocument(); workQueue=new LinkedBlockingQueue(MAX_QUEUED_WAITING); } public void setIndexType(int t) { indexType = t; } @Override public void run() { thread().setPriority(Thread.MIN_PRIORITY); logger.log(logger.EXTREME, "Starting index thread "); while (keepRunning) { try { String work = workQueue.take(); if (work.startsWith("CONTENT")) { work = work.replace("CONTENT ", ""); guid = work; indexType = CONTENT; } if (work.startsWith("RESOURCE")) { work = work.replace("RESOURCE ", ""); guid = work; indexType = RESOURCE; } if (work.startsWith("STOP")) { keepRunning = false; guid = work; } if (guid == null || guid.trim().equals("")) { setIndexType(0); resourceSignal.resourceIndexed.emit("null or empty guid"); } logger.log(logger.EXTREME, "Type:" +indexType); if (indexType == CONTENT && keepRunning) { logger.log(logger.MEDIUM, "Indexing note: "+guid); indexNoteContent(); setIndexType(0); } if (indexType == RESOURCE && keepRunning) { logger.log(logger.MEDIUM, "Indexing resource: "+guid); indexResource(); setIndexType(0); } } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } conn.dbShutdown(); } // Reindex a note public void indexNoteContent() { logger.log(logger.EXTREME, "Entering indexRunner.indexNoteContent()"); logger.log(logger.EXTREME, "Getting note content"); Note n = conn.getNoteTable().getNote(guid,true,false,true,true, true); String data = n.getContent(); logger.log(logger.EXTREME, "Removing any encrypted data"); data = removeEnCrypt(data); logger.log(logger.EXTREME, "Removing xml markups"); // These HTML characters need to be replaced by a space, or they'll cause words to jam together // data = data.toLowerCase().replace("
", " ").replace("
", " ").replace("

", " ").replace("", " "); // String text = StringEscapeUtils.unescapeHtml(data.replaceAll("\\<.*?\\>", "")); Tidy tidy = new Tidy(); tidy.getStderr().close(); // the listener will capture messages tidy.setXmlTags(true); byte html[] = data.getBytes(); ByteArrayInputStream is = new ByteArrayInputStream(html); ByteArrayOutputStream os = new ByteArrayOutputStream(); tidy.parse(is, os); String text = StringEscapeUtils.unescapeHtml(os.toString().replaceAll("\\<.*?\\>", "")) +" "+ n.getTitle(); logger.log(logger.EXTREME, "Splitting words"); String[] result = text.toString().split(regex); logger.log(logger.EXTREME, "Deleting existing words for note from index"); conn.getWordsTable().expungeFromWordIndex(guid, "CONTENT"); logger.log(logger.EXTREME, "Number of words found: " +result.length); for (int j=0; j", index)+11; if (endPos > -1 && index > -1) { content = content.substring(0,index)+content.substring(endPos); index = content.indexOf(" 0) { // We have a good word, now let's trim off junk at the beginning or end StringBuffer buffer = new StringBuffer(word.toLowerCase()); for (int x = buffer.length()-1; x>=0; x--) { if (!Character.isLetterOrDigit(buffer.charAt(x))) buffer = buffer.deleteCharAt(x); else x=-1; } // Things have been trimmed off the end, so reverse the string & repeat. buffer = buffer.reverse(); for (int x = buffer.length()-1; x>=0; x--) { if (!Character.isLetterOrDigit(buffer.charAt(x))) buffer = buffer.deleteCharAt(x); else x=-1; } // Restore the string back to the proper order. buffer = buffer.reverse(); logger.log(logger.EXTREME, "Processing " +buffer); if (buffer.length()>=Global.minimumWordCount) { logger.log(logger.EXTREME, "Adding " +buffer); conn.getWordsTable().addWordToNoteIndex(guid, buffer.toString(), type, 100); } } } }