2 * This file is part of NeverNote
\r
3 * Copyright 2009 Randy Baumgarte
\r
5 * This file may be licensed under the terms of the
\r
6 * GNU General Public License Version 2 (the ``GPL'').
\r
8 * Software distributed under the License is distributed
\r
9 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
\r
10 * express or implied. See the GPL for the specific language
\r
11 * governing rights and limitations.
\r
13 * You should have received a copy of the GPL along with this
\r
14 * program. If not, go to http://www.gnu.org/licenses/gpl.html
\r
15 * or write to the Free Software Foundation, Inc.,
\r
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
\r
19 package cx.fbn.nevernote.evernote;
\r
21 import java.io.ByteArrayInputStream;
\r
22 import java.io.ByteArrayOutputStream;
\r
23 import java.io.File;
\r
24 import java.util.ArrayList;
\r
25 import java.util.List;
\r
27 import org.w3c.tidy.Tidy;
\r
28 import org.w3c.tidy.TidyMessage;
\r
30 import com.trolltech.qt.core.QByteArray;
\r
31 import com.trolltech.qt.core.QTextCodec;
\r
33 import cx.fbn.nevernote.Global;
\r
34 import cx.fbn.nevernote.utilities.ApplicationLogger;
\r
35 import cx.fbn.nevernote.xml.XMLCleanup;
\r
36 import cx.fbn.nevernote.xml.XMLNoteRepair;
\r
38 public class EnmlConverter {
\r
39 private final ApplicationLogger logger;
\r
40 private List<String> resources;
\r
41 public boolean saveInvalidXML;
\r
43 private class TidyListener implements org.w3c.tidy.TidyMessageListener {
\r
45 ApplicationLogger logger;
\r
46 public boolean errorFound;
\r
48 public TidyListener(ApplicationLogger logger) {
\r
49 this.logger = logger;
\r
53 public void messageReceived(TidyMessage msg) {
\r
54 if (msg.getLevel() == TidyMessage.Level.ERROR) {
\r
55 logger.log(logger.LOW, "******* JTIDY ERORR *******");
\r
56 logger.log(logger.LOW, "Error Code: " +msg.getErrorCode());
\r
57 logger.log(logger.LOW, "Column: " +msg.getColumn());
\r
58 logger.log(logger.LOW, "Column: " +msg.getColumn());
\r
59 logger.log(logger.LOW, "Line: " +msg.getLine());
\r
60 logger.log(logger.LOW, "Message: " +msg.getMessage());
\r
61 logger.log(logger.LOW, "***************************");
\r
64 logger.log(logger.EXTREME, "JTidy Results: "+msg.getMessage());
\r
69 public EnmlConverter(ApplicationLogger l) {
\r
72 saveInvalidXML = false;
\r
73 resources = new ArrayList<String>();
\r
76 public List<String> getResources() {
\r
79 public String convert(String noteGuid, String content) {
\r
80 logger.log(logger.HIGH, "Entering DBRunner.convertToEnml");
\r
81 logger.log(logger.EXTREME, "Note Text:" +content);
\r
83 // Replace the en-note tags with body tags in case we came from
\r
84 // someplace other than the editor (for example, if we are merging notes).
\r
85 content = content.replace("<en-note>", "<body>");
\r
86 content = content.replace("</en-note>", "</body>");
\r
87 // Start removing stuff we don't need or want
\r
88 int br = content.lastIndexOf("</body>");
\r
90 content = new String(content.substring(0,br));
\r
92 int k = content.indexOf("<body");
\r
94 newContent = new String(content.substring(k));
\r
96 newContent = "<body>"+content;
\r
99 // Check that we have a valid header. Normally we should not
\r
100 // but sometimes it seems that we can. I don't see how, but it is
\r
101 // easy enough to check.
\r
102 if (!newContent.startsWith("<?xml"))
\r
103 newContent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
\r
104 +"<!DOCTYPE en-note SYSTEM \"http://xml.evernote.com/pub/enml2.dtd\">\n"
\r
109 // Fix the more common XML problems that Webkit creates, but are not considered
\r
111 newContent = fixStupidXMLProblems(newContent);
\r
114 // Change the contents to have enml instead of body tags or
\r
115 // we'll fail validation later.
\r
116 newContent = newContent.replace("<body", "<en-note");
\r
117 newContent = newContent.replace("</body>", "</en-note>");
\r
119 // First pass through the data. The goal of this pass is to
\r
120 // validate that we have a good XML document and to repair
\r
121 // any problems found.
\r
123 XMLNoteRepair repair = new XMLNoteRepair();
\r
124 // logger.log(logger.HIGH, "Checking XML Structure");
\r
125 // newContent = repair.parse(newContent, false);
\r
126 // logger.log(logger.HIGH, "Check complete");
\r
128 Tidy tidy = new Tidy();
\r
129 TidyListener tidyListener = new TidyListener(logger);
\r
130 tidy.setMessageListener(tidyListener);
\r
131 tidy.getStderr().close(); // the listener will capture messages
\r
132 tidy.setXmlTags(true);
\r
135 codec = QTextCodec.codecForName("UTF-8");
\r
136 QByteArray unicode = codec.fromUnicode(newContent);
\r
138 // byte html[] = newContent.getBytes();
\r
139 // ByteArrayInputStream is = new ByteArrayInputStream(html);
\r
141 ByteArrayInputStream is = new ByteArrayInputStream(unicode.toByteArray());
\r
142 ByteArrayOutputStream os = new ByteArrayOutputStream();
\r
143 tidy.setInputEncoding("UTF-8");
\r
144 // tidy.setOutputEncoding("UTF-8");
\r
145 tidy.parse(is, os);
\r
146 newContent = os.toString();
\r
147 // newContent = new QByteArray(codec.fromUnicode(os.toString())).toString();
\r
148 if (tidyListener.errorFound) {
\r
149 logger.log(logger.LOW, "Note Contents Begin");
\r
150 logger.log(logger.LOW, content);
\r
151 logger.log(logger.LOW, "Note Contents End");
\r
154 if (newContent.trim().equals(""))
\r
158 // If the repair above returned null, then the XML is foobar.
\r
159 // We are done here.
\r
160 if (newContent == null) {
\r
161 // Houston, we've had a problem.
\r
162 logger.log(logger.LOW, "Parse error when converting to ENML");
\r
163 logger.log(logger.LOW, "Start of unmodified note HTML");
\r
164 logger.log(logger.LOW, content);
\r
165 logger.log(logger.LOW, "End of unmodified note HTML");
\r
166 logger.log(logger.LOW, "Start of modified note HTML");
\r
167 logger.log(logger.LOW, newContent);
\r
168 logger.log(logger.LOW, "End of modified note HTML");
\r
169 // logger.log(logger.LOW, result.errorMessage);
\r
170 // logger.log(logger.LOW, "Error Line:Column "+result.errorLine+":" +result.errorColumn);
\r
176 // Second pass through the data. The goal of this pass is to
\r
177 // remove any things we added in NeverNote that do not match
\r
179 XMLCleanup v = new XMLCleanup();
\r
180 v.setValue(newContent);
\r
181 logger.log(logger.HIGH, "Beginning ENML Cleanup");
\r
183 logger.log(logger.HIGH, "Cleanup complete.");
\r
187 // Final pass through the data. In this one we
\r
188 // remove any invalid attributes and to save the
\r
190 logger.log(logger.EXTREME, "Rebuilt ENML:");
\r
191 logger.log(logger.EXTREME, v.getValue());
\r
192 logger.log(logger.EXTREME, "End Of Rebuilt ENML:");
\r
193 resources = v.getResources();
\r
196 // The XML has the dtd to validate set against Evernote's web
\r
197 // address. We change it to a local one because otherwise it would
\r
198 // fail if the user doesn't have internet connectivity. The local copy
\r
199 // also contains the 3 other PUBLIC definitions at the beginning of the dtd.
\r
200 newContent = v.getValue();
\r
201 File dtdFile = Global.getFileManager().getXMLDirFile("enml2.dtd");
\r
202 String dtd = dtdFile.toURI().toString();
\r
203 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \'http://xml.evernote.com/pub/enml2.dtd'>",
\r
204 "<!DOCTYPE en-note SYSTEM \"" +dtd +"\">");
\r
206 logger.log(logger.HIGH, "Validating ENML");
\r
207 newContent = repair.parse(newContent, true);
\r
208 logger.log(logger.HIGH, "Validation complete");
\r
209 saveInvalidXML = repair.saveInvalidXML;
\r
211 // Restore the correct XML header.
\r
212 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \"" +dtd +"\">",
\r
213 "<!DOCTYPE en-note SYSTEM 'http://xml.evernote.com/pub/enml2.dtd'>");
\r
221 // Fix XML problems that Qt can't deal with
\r
222 public String fixStupidXMLProblems(String content) {
\r
223 logger.log(logger.HIGH, "Entering DBRunner.fixStupidXMLProblems");
\r
225 // Fix the problem that the document body isn't properly closed
\r
226 String newContent = new String(content);
\r
227 logger.log(logger.MEDIUM, "Inside fixStupidXMLProblems. Old content:");
\r
228 logger.log(logger.MEDIUM, content);
\r
230 // Fix the problem that the img tag isn't properly closed
\r
232 logger.log(logger.MEDIUM, "Checking img tags");
\r
233 for (int i=newContent.indexOf("<img"); i>0; i = newContent.indexOf("<img",i+1)) {
\r
234 endPos = newContent.indexOf(">",i+1);
\r
235 String end = newContent.substring(endPos+1);
\r
236 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
239 // Fix the problem that the input tag isn't properly closed
\r
240 logger.log(logger.MEDIUM, "Checking input tags");
\r
241 for (int i=newContent.indexOf("<input"); i>0; i = newContent.indexOf("<input",i+1)) {
\r
242 endPos = newContent.indexOf(">",i+1);
\r
243 String end = newContent.substring(endPos+1);
\r
244 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
248 // Fix the problem that the <br> tag isn't properly closed
\r
249 logger.log(logger.MEDIUM, "Checking br tags");
\r
250 for (int i=newContent.indexOf("<br"); i>0; i = newContent.indexOf("<br",i+1)) {
\r
251 endPos = newContent.indexOf(">",i+1);
\r
252 String end = newContent.substring(endPos+1);
\r
253 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
256 // Fix the problem that the <hr> tag isn't properly closed
\r
257 logger.log(logger.MEDIUM, "Checking hr tags");
\r
258 for (int i=newContent.indexOf("<hr"); i>0; i = newContent.indexOf("<hr",i+1)) {
\r
259 endPos = newContent.indexOf(">",i+1);
\r
260 String end = newContent.substring(endPos+1);
\r
261 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
264 logger.log(logger.MEDIUM, "Leaving fixStupidXMLProblems");
\r
265 logger.log(logger.HIGH, "Leaving DBRunner.fixStupidXMLProblems");
\r
266 return newContent.toString();
\r
270 // Fix XML that Evernote thinks is invalid
\r
271 public String fixEnXMLCrap(String note) {
\r
276 StringBuffer buffer = new StringBuffer(note);
\r
278 // change all <b/> to <b></b> because Evernote hates them if they happen in <span>
\r
279 pos = buffer.indexOf("<b/>");
\r
281 buffer.replace(pos, pos+4, "<b></b>");
\r
282 pos = buffer.indexOf("<b/>",pos);
\r
284 // change all <br/> to <br></br> because Evernote hates them if they happen in <span>
\r
285 pos = buffer.indexOf("<br/>");
\r
287 buffer.replace(pos, pos+5, "<br></br>");
\r
288 pos = buffer.indexOf("<br/>",pos);
\r
291 // change all <span> elements in lists because Evernote hates them if they happen
\r
294 pos = buffer.indexOf("<li>");
\r
295 spanPos = buffer.indexOf("<span>");
\r
296 /* for (; pos>-1 && spanPos >-1;) {
\r
297 endPos = buffer.indexOf("</li>",pos);
\r
298 if (spanPos > pos && spanPos < endPos) {
\r
299 buffer.replace(spanPos,spanPos+6,"");
\r
300 spanPos = buffer.indexOf("</span>");
\r
301 buffer.replace(spanPos,spanPos+7,"");
\r
303 pos=buffer.indexOf("<li>",pos+1);
\r
304 spanPos = buffer.indexOf("<span>",spanPos);
\r
307 // Get rid of empty spans in <li> elements
\r
308 pos = buffer.indexOf("<li>");
\r
309 spanPos = buffer.indexOf("<span/>");
\r
310 for (; pos>-1 && spanPos >-1;) {
\r
311 endPos = buffer.indexOf("</li>",pos);
\r
312 if (spanPos > pos && spanPos < endPos) {
\r
313 buffer.replace(spanPos,spanPos+7,"");
\r
315 pos=buffer.indexOf("<li>",pos+1);
\r
316 spanPos = buffer.indexOf("<span/>",spanPos);
\r
319 return buffer.toString();
\r
322 // Fix stupid en-media problems
\r
323 public String fixEnMediaCrap(String note) {
\r
327 StringBuffer buffer = new StringBuffer(note);
\r
328 // get rid of any </en-media> tags since they shouldn't exist.
\r
329 int pos = buffer.indexOf("</en-media>");
\r
331 buffer.replace(pos, pos+11, "");
\r
332 pos = buffer.indexOf("</en-media>",pos);
\r
336 // Make sure we have a proper /> ending the en-media tag
\r
337 pos = buffer.indexOf("<en-media");
\r
339 pos=buffer.indexOf(">", pos);
\r
340 if (!buffer.substring(pos-1,pos).equals("/"))
\r
341 buffer.replace(pos, pos+1, " />");
\r
342 pos = buffer.indexOf("<en-media",pos);
\r
345 return buffer.toString();
\r