2 * This file is part of NeverNote
\r
3 * Copyright 2009 Randy Baumgarte
\r
5 * This file may be licensed under the terms of the
\r
6 * GNU General Public License Version 2 (the ``GPL'').
\r
8 * Software distributed under the License is distributed
\r
9 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
\r
10 * express or implied. See the GPL for the specific language
\r
11 * governing rights and limitations.
\r
13 * You should have received a copy of the GPL along with this
\r
14 * program. If not, go to http://www.gnu.org/licenses/gpl.html
\r
15 * or write to the Free Software Foundation, Inc.,
\r
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
\r
19 package cx.fbn.nevernote.evernote;
\r
21 //**********************************************
\r
22 //**********************************************
\r
23 //* This is used to turn HTML into ENML compliant
\r
25 //**********************************************
\r
26 //**********************************************
\r
28 import java.io.ByteArrayInputStream;
\r
29 import java.io.ByteArrayOutputStream;
\r
30 import java.io.File;
\r
31 import java.util.ArrayList;
\r
32 import java.util.List;
\r
34 import org.w3c.tidy.Tidy;
\r
35 import org.w3c.tidy.TidyMessage;
\r
37 import com.trolltech.qt.core.QByteArray;
\r
38 import com.trolltech.qt.core.QTextCodec;
\r
40 import cx.fbn.nevernote.Global;
\r
41 import cx.fbn.nevernote.utilities.ApplicationLogger;
\r
42 import cx.fbn.nevernote.utilities.Pair;
\r
43 import cx.fbn.nevernote.xml.XMLCleanup;
\r
44 import cx.fbn.nevernote.xml.XMLNoteRepair;
\r
46 public class EnmlConverter {
\r
// Logger that receives every diagnostic message written by this converter.
47 private final ApplicationLogger logger;
\r
// Resource list; convert() overwrites it with XMLCleanup.getResources().
48 private List<String> resources;
\r
// Copied from XMLNoteRepair.saveInvalidXML after the validation pass in convert().
49 public boolean saveInvalidXML;
\r
// Message listener handed to JTidy by convert(); forwards JTidy messages to the
// application log and exposes an errorFound flag that convert() inspects afterwards.
51 private class TidyListener implements org.w3c.tidy.TidyMessageListener {
\r
// Log sink supplied through the constructor.
53 ApplicationLogger logger;
\r
// Read by convert() after tidy.parse(); when true the JTidy output is discarded.
// NOTE(review): the statement that sets this flag is not visible in this view.
54 public boolean errorFound;
\r
// Stores the logger used by messageReceived().
56 public TidyListener(ApplicationLogger logger) {
\r
57 this.logger = logger;
\r
// Called for each JTidy message. ERROR-level messages are written at LOW
// verbosity with their error code, column, line and text.
61 public void messageReceived(TidyMessage msg) {
\r
62 if (msg.getLevel() == TidyMessage.Level.ERROR) {
\r
63 logger.log(logger.LOW, "******* JTIDY ERORR *******");
\r
64 logger.log(logger.LOW, "Error Code: " +msg.getErrorCode());
\r
// NOTE(review): the column is logged twice below; looks like a copy-paste duplicate.
65 logger.log(logger.LOW, "Column: " +msg.getColumn());
\r
66 logger.log(logger.LOW, "Column: " +msg.getColumn());
\r
67 logger.log(logger.LOW, "Line: " +msg.getLine());
\r
68 logger.log(logger.LOW, "Message: " +msg.getMessage());
\r
69 logger.log(logger.LOW, "***************************");
\r
// Message text logged at EXTREME verbosity.
// NOTE(review): presumably the non-error (else) branch; the branch keyword is not visible here.
72 logger.log(logger.EXTREME, "JTidy Results: "+msg.getMessage());
\r
// Creates a converter with saveInvalidXML cleared and an empty resource list.
// NOTE(review): the assignment of `l` to the logger field is not visible in this view; confirm.
77 public EnmlConverter(ApplicationLogger l) {
\r
80 saveInvalidXML = false;
\r
81 resources = new ArrayList<String>();
\r
// Accessor for the resource list. Presumably returns the `resources` field that
// convert() fills in; the method body is not visible in this view, so verify.
84 public List<String> getResources() {
\r
// Converts note content (editor HTML, or ENML when merging notes) into ENML.
// Visible stages: swap <en-note>/<body> tags, trim to the body, prepend an XML
// header when absent, apply fixStupidXMLProblems and fixEncryptionTags, run the
// text through JTidy (falling back to XMLNoteRepair.parse when JTidy reports an
// error or the input is blank), clean it with XMLCleanup, then validate it
// against a local copy of enml2.dtd.
// Side effects visible here: overwrites `resources` and `saveInvalidXML`.
// NOTE(review): noteGuid is not referenced in any visible line. Several short
// lines (conditions, braces, else/return statements, local declarations) are
// not visible in this view, so exact branching and return values must be
// confirmed against the full file.
87 public String convert(String noteGuid, String content) {
\r
88 logger.log(logger.HIGH, "Entering DBRunner.convertToEnml");
\r
89 logger.log(logger.EXTREME, "Note Text:" +content);
\r
91 // Replace the en-note tags with body tags in case we came from
\r
92 // someplace other than the editor (for example, if we are merging notes).
\r
93 content = content.replace("<en-note>", "<body>");
\r
94 content = content.replace("</en-note>", "</body>");
\r
95 // Start removing stuff we don't need or want
\r
96 int br = content.lastIndexOf("</body>");
\r
// Cut off the last </body> and everything after it.
// NOTE(review): substring(0, br) throws when br is -1; the guard is presumably on a line not visible here.
98 content = new String(content.substring(0,br));
\r
100 int k = content.indexOf("<body");
\r
// Keep the text from the opening <body tag onwards ...
102 newContent = new String(content.substring(k));
\r
// ... or wrap the content in <body> (presumably the branch taken when no <body tag was found).
104 newContent = "<body>"+content;
\r
107 // Check that we have a valid header. Normally we should not
\r
108 // but sometimes it seems that we can. I don't see how, but it is
\r
109 // easy enough to check.
\r
110 if (!newContent.startsWith("<?xml"))
\r
111 newContent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
\r
112 +"<!DOCTYPE en-note SYSTEM \"http://xml.evernote.com/pub/enml2.dtd\">\n"
\r
117 // Fix the more common XML problems that Webkit creates, but are not considered
\r
119 newContent = fixStupidXMLProblems(newContent);
\r
122 // Change the contents to have enml instead of body tags or
\r
123 // we'll fail validation later.
\r
124 newContent = newContent.replace("<body", "<en-note");
\r
125 newContent = newContent.replace("</body>", "</en-note>");
\r
127 // First pass through the data. The goal of this pass is to
\r
128 // validate that we have a good XML document and to repair
\r
129 // any problems found.
\r
// The repair object is used below as the JTidy fallback and again for the final validation.
131 XMLNoteRepair repair = new XMLNoteRepair();
\r
132 // logger.log(logger.HIGH, "Checking XML Structure");
\r
133 // newContent = repair.parse(newContent, false);
\r
134 // logger.log(logger.HIGH, "Check complete");
\r
136 logger.log(logger.HIGH, "Fixing encryption tags");
\r
137 newContent = fixEncryptionTags(newContent);
\r
// Configure JTidy in XML mode and route its messages to a TidyListener.
139 Tidy tidy = new Tidy();
\r
140 TidyListener tidyListener = new TidyListener(logger);
\r
141 tidy.setMessageListener(tidyListener);
\r
142 tidy.getStderr().close(); // the listener will capture messages
\r
143 tidy.setXmlTags(true);
\r
// Encode the text with Qt's UTF-8 codec; these bytes are what JTidy reads
// below, matching tidy.setInputEncoding("UTF-8").
146 codec = QTextCodec.codecForName("UTF-8");
\r
147 QByteArray unicode = codec.fromUnicode(newContent);
\r
149 // byte html[] = newContent.getBytes();
\r
150 // ByteArrayInputStream is = new ByteArrayInputStream(html);
\r
151 logger.log(logger.HIGH, "Starting JTidy check");
\r
152 logger.log(logger.EXTREME, "Start of JTidy Input");
\r
153 logger.log(logger.EXTREME, newContent);
\r
154 logger.log(logger.EXTREME, "End Of JTidy Input");
\r
155 ByteArrayInputStream is = new ByteArrayInputStream(unicode.toByteArray());
\r
156 ByteArrayOutputStream os = new ByteArrayOutputStream();
\r
157 tidy.setInputEncoding("UTF-8");
\r
158 tidy.parse(is, os);
\r
// NOTE(review): os.toString() decodes with the platform default charset; confirm
// that this matches the encoding JTidy wrote.
159 String tidyContent = os.toString();
\r
// A JTidy error discards the tidy output so the fallback path below is used.
// The text logged here is `content` (the trimmed input), not newContent.
160 if (tidyListener.errorFound) {
\r
161 logger.log(logger.LOW, "Note Contents Begin");
\r
162 logger.log(logger.LOW, content);
\r
163 logger.log(logger.LOW, "Note Contents End");
\r
164 tidyContent = null;
\r
// Blank input is also treated as "no JTidy result".
166 if (newContent.trim().equals(""))
\r
167 tidyContent = null;
\r
170 // If the repair above returned null, then the XML is foobar.
\r
171 // We are done here.
\r
172 if (tidyContent != null) {
\r
173 newContent = tidyContent;
\r
175 // Houston, we've had a problem. Fall back to old method
\r
176 logger.log(logger.HIGH, "Error converting to JTidy. Falling back to old method");
\r
177 String repairedContent = repair.parse(newContent, false);
\r
178 if (repairedContent == null) {
\r
179 logger.log(logger.EXTREME, "Null returned from repair.parse()");
\r
180 logger.log(logger.LOW, "Parse error when converting to ENML. Aborting save");
\r
// NOTE(review): the statement that aborts after the two log calls above is not visible in this view.
183 newContent = repairedContent;
\r
184 logger.log(logger.EXTREME, "Start of repaired content");
\r
185 logger.log(logger.EXTREME, repairedContent);
\r
186 logger.log(logger.EXTREME, "End of repaired content");
\r
189 // Second pass through the data. The goal of this pass is to
\r
190 // remove any things we added in NeverNote that do not match
\r
192 XMLCleanup v = new XMLCleanup();
\r
193 v.setValue(newContent);
\r
194 logger.log(logger.HIGH, "Beginning ENML Cleanup");
\r
// NOTE(review): the call that performs the cleanup between these two log lines is not visible here.
196 logger.log(logger.HIGH, "Cleanup complete.");
\r
200 // Final pass through the data. In this one we
\r
201 // remove any invalid attributes and to save the
\r
203 logger.log(logger.EXTREME, "Rebuilt ENML:");
\r
204 logger.log(logger.EXTREME, v.getValue());
\r
205 logger.log(logger.EXTREME, "End Of Rebuilt ENML:");
\r
// Publish the resources collected by XMLCleanup on the converter.
206 resources = v.getResources();
\r
209 // The XML has the dtd to validate set against Evernote's web
\r
210 // address. We change it to a local one because otherwise it would
\r
211 // fail if the user doesn't have internet connectivity. The local copy
\r
212 // also contains the 3 other PUBLIC definitions at the beginning of the dtd.
\r
213 newContent = v.getValue();
\r
214 File dtdFile = Global.getFileManager().getXMLDirFile("enml2.dtd");
\r
215 String dtd = dtdFile.toURI().toString();
\r
// NOTE(review): this only matches the single-quoted DOCTYPE form, while the header
// prepended earlier in this method uses double quotes; presumably an earlier
// stage rewrites the quoting. Verify, otherwise the replace is a no-op.
216 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \'http://xml.evernote.com/pub/enml2.dtd'>",
\r
217 "<!DOCTYPE en-note SYSTEM \"" +dtd +"\">");
\r
219 logger.log(logger.HIGH, "Validating ENML");
\r
// Second parse call, this time with `true` as the second argument (validation pass).
220 String repairedContent = repair.parse(newContent, true);
\r
221 if (repairedContent == null)
\r
222 logger.log(logger.EXTREME, "Null returned from repair.parse()");
\r
// NOTE(review): the branch keyword before this assignment is not visible; if newContent
// can become null here, the replace() further down would throw.
224 newContent = repairedContent;
\r
225 logger.log(logger.HIGH, "Validation complete");
\r
226 saveInvalidXML = repair.saveInvalidXML;
\r
228 // Restore the correct XML header.
\r
229 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \"" +dtd +"\">",
\r
230 "<!DOCTYPE en-note SYSTEM 'http://xml.evernote.com/pub/enml2.dtd'>");
\r
233 logger.log(logger.EXTREME, "Leaving ENMLConverter.convert()");
\r
// Replaces every placeholder table of the form <table class="en-crypt-temp" ...>
// with an <en-crypt-temp cipher="RC2" length="64" ...> element. For each table the
// slot name and the first <td> text are extracted, the password/hint pair stored
// under that slot in Global.passwordSafe is looked up, and the text is passed to
// EnCrypt.encrypt(text, password, 64).
// NOTE(review): the tail of the replacement expression and the return statement
// are not visible in this view.
238 private String fixEncryptionTags(String content) {
\r
239 // Work on a copy of the incoming content.
\r
240 String newContent = new String(content);
\r
241 logger.log(logger.MEDIUM, "Inside EnmlConverter.fixEncryptionTags");
\r
242 logger.log(logger.EXTREME, content);
\r
244 // Offsets used while locating each placeholder table and its parts.
\r
245 int endPos, startPos, endData,slotStart, slotEnd;
\r
246 logger.log(logger.MEDIUM, "Checking table encryption tags");
\r
247 String eTag = "<table class=\"en-crypt-temp\"";
\r
// NOTE(review): the condition i>0 means a table starting at index 0 is never processed
// (indexOf reports "not found" as -1, not 0).
248 for (int i=newContent.indexOf(eTag); i>0; i = newContent.indexOf(eTag,i+1)) {
\r
// +6 skips the four characters of "slot" plus two more; assumes the attribute is
// written exactly as slot="..." (TODO confirm).
249 slotStart = newContent.indexOf("slot", i+1)+6;
\r
250 slotEnd = newContent.indexOf("\"",slotStart);
\r
251 String slot = newContent.substring(slotStart, slotEnd);
\r
// The text to encrypt is the content of the first <td> after the table start (+4 = length of "<td>").
252 startPos = newContent.indexOf("<td>", i+1)+4;
\r
253 endData = newContent.indexOf("</td>",startPos);
\r
254 String text = newContent.substring(startPos,endData);
\r
// End of the table, just past "</table>" (+8 = its length).
255 endPos = newContent.indexOf("</table>",i+1)+8;
\r
256 // Encrypt the text
\r
// NOTE(review): if passwordSafe.get(slot) can return null for an unknown slot,
// the getFirst() call below would throw; verify against Global.passwordSafe.
257 Pair<String,String> pair = Global.passwordSafe.get(slot);
\r
258 String password = pair.getFirst();
\r
259 String hint = pair.getSecond();
\r
260 EnCrypt crypt = new EnCrypt();
\r
261 String encrypted = crypt.encrypt(text, password, 64);
\r
263 // replace the table with an en-crypt tag.
\r
// NOTE(review): substring(0, i-1) also drops the single character immediately before
// the table; confirm that is intended. The hint is inserted into the attribute unescaped.
264 newContent = newContent.substring(0,i-1) +
\r
265 "<en-crypt-temp cipher=\"RC2\" length=\"64\" hint=\""+
\r
266 hint +"\" value=\""+
\r
269 newContent.substring(endPos);
\r
275 // Fix XML problems that Qt can't deal with
\r
276 public String fixStupidXMLProblems(String content) {
\r
277 logger.log(logger.HIGH, "Entering DBRunner.fixStupidXMLProblems");
\r
279 // Fix the problem that the document body isn't properly closed
\r
280 String newContent = new String(content);
\r
281 logger.log(logger.MEDIUM, "Inside fixStupidXMLProblems. Old content:");
\r
282 logger.log(logger.EXTREME, content);
\r
284 // Fix the problem that the img tag isn't properly closed
\r
286 logger.log(logger.MEDIUM, "Checking img tags");
\r
287 for (int i=newContent.indexOf("<img"); i>0; i = newContent.indexOf("<img",i+1)) {
\r
288 endPos = newContent.indexOf(">",i+1);
\r
289 String end = newContent.substring(endPos+1);
\r
290 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
293 // Fix the problem that the input tag isn't properly closed
\r
294 logger.log(logger.MEDIUM, "Checking input tags");
\r
295 for (int i=newContent.indexOf("<input"); i>0; i = newContent.indexOf("<input",i+1)) {
\r
296 endPos = newContent.indexOf(">",i+1);
\r
297 String end = newContent.substring(endPos+1);
\r
298 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
302 // Fix the problem that the <br> tag isn't properly closed
\r
303 logger.log(logger.MEDIUM, "Checking br tags");
\r
304 for (int i=newContent.indexOf("<br"); i>0; i = newContent.indexOf("<br",i+1)) {
\r
305 endPos = newContent.indexOf(">",i+1);
\r
306 String end = newContent.substring(endPos+1);
\r
307 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
310 // Fix the problem that the <hr> tag isn't properly closed
\r
311 logger.log(logger.MEDIUM, "Checking hr tags");
\r
312 for (int i=newContent.indexOf("<hr"); i>0; i = newContent.indexOf("<hr",i+1)) {
\r
313 endPos = newContent.indexOf(">",i+1);
\r
314 String end = newContent.substring(endPos+1);
\r
315 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
318 logger.log(logger.MEDIUM, "Leaving fixStupidXMLProblems");
\r
319 logger.log(logger.HIGH, "Leaving DBRunner.fixStupidXMLProblems");
\r
320 return newContent.toString();
\r
324 // Fix XML that Evernote thinks is invalid
\r
325 public String fixEnXMLCrap(String note) {
\r
326 logger.log(logger.EXTREME, "Entering EnmlConverter.fixEnXMLCrap");
\r
331 StringBuffer buffer = new StringBuffer(note);
\r
333 logger.log(logger.EXTREME, "Converting <b/>");
\r
334 // change all <b/> to <b></b> because Evernote hates them if they happen in <span>
\r
335 pos = buffer.indexOf("<b/>");
\r
337 buffer.replace(pos, pos+4, "<b></b>");
\r
338 pos = buffer.indexOf("<b/>",pos);
\r
340 // change all <br/> to <br></br> because Evernote hates them if they happen in <span>
\r
341 logger.log(logger.EXTREME, "converting <br/>");
\r
342 pos = buffer.indexOf("<br/>");
\r
344 buffer.replace(pos, pos+5, "<br></br>");
\r
345 pos = buffer.indexOf("<br/>",pos);
\r
348 // change all <span> elements in lists because Evernote hates them if they happen
\r
351 pos = buffer.indexOf("<li>");
\r
352 spanPos = buffer.indexOf("<span>");
\r
353 // Get rid of empty spans in <li> elements
\r
354 pos = buffer.indexOf("<li>");
\r
355 spanPos = buffer.indexOf("<span/>");
\r
356 for (; pos>-1 && spanPos >-1;) {
\r
357 endPos = buffer.indexOf("</li>",pos);
\r
358 if (spanPos > pos && spanPos < endPos) {
\r
359 buffer.replace(spanPos,spanPos+7,"");
\r
361 pos=buffer.indexOf("<li>",pos+1);
\r
362 spanPos = buffer.indexOf("<span/>",spanPos);
\r
365 logger.log(logger.EXTREME, "Leaving EnmlConverter.fixEnXMLCrap");
\r
366 return buffer.toString();
\r
369 // Fix stupid en-media problems
\r
370 public String fixEnMediaCrap(String note) {
\r
374 StringBuffer buffer = new StringBuffer(note);
\r
375 // get rid of any </en-media> tags since they shouldn't exist.
\r
376 int pos = buffer.indexOf("</en-media>");
\r
378 buffer.replace(pos, pos+11, "");
\r
379 pos = buffer.indexOf("</en-media>",pos);
\r
383 // Make sure we have a proper /> ending the en-media tag
\r
384 pos = buffer.indexOf("<en-media");
\r
386 pos=buffer.indexOf(">", pos);
\r
387 if (!buffer.substring(pos-1,pos).equals("/"))
\r
388 buffer.replace(pos, pos+1, " />");
\r
389 pos = buffer.indexOf("<en-media",pos);
\r
392 return buffer.toString();
\r