2 * This file is part of NeverNote
\r
3 * Copyright 2009 Randy Baumgarte
\r
5 * This file may be licensed under the terms of the
\r
6 * GNU General Public License Version 2 (the ``GPL'').
\r
8 * Software distributed under the License is distributed
\r
9 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
\r
10 * express or implied. See the GPL for the specific language
\r
11 * governing rights and limitations.
\r
13 * You should have received a copy of the GPL along with this
\r
14 * program. If not, go to http://www.gnu.org/licenses/gpl.html
\r
15 * or write to the Free Software Foundation, Inc.,
\r
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
\r
19 package cx.fbn.nevernote.evernote;
\r
21 import java.io.ByteArrayInputStream;
\r
22 import java.io.ByteArrayOutputStream;
\r
23 import java.io.File;
\r
24 import java.util.ArrayList;
\r
25 import java.util.List;
\r
27 import org.w3c.tidy.Tidy;
\r
28 import org.w3c.tidy.TidyMessage;
\r
30 import cx.fbn.nevernote.Global;
\r
31 import cx.fbn.nevernote.utilities.ApplicationLogger;
\r
32 import cx.fbn.nevernote.xml.XMLCleanup;
\r
33 import cx.fbn.nevernote.xml.XMLNoteRepair;
\r
35 public class EnmlConverter {
\r
36 private final ApplicationLogger logger;
\r
37 private List<String> resources;
\r
38 public boolean saveInvalidXML;
\r
40 private class TidyListener implements org.w3c.tidy.TidyMessageListener {
\r
42 ApplicationLogger logger;
\r
43 public boolean errorFound;
\r
45 public TidyListener(ApplicationLogger logger) {
\r
46 this.logger = logger;
\r
50 public void messageReceived(TidyMessage msg) {
\r
51 if (msg.getLevel() == TidyMessage.Level.ERROR) {
\r
52 logger.log(logger.LOW, "******* JTIDY ERORR *******");
\r
53 logger.log(logger.LOW, "Error Code: " +msg.getErrorCode());
\r
54 logger.log(logger.LOW, "Column: " +msg.getColumn());
\r
55 logger.log(logger.LOW, "Column: " +msg.getColumn());
\r
56 logger.log(logger.LOW, "Line: " +msg.getLine());
\r
57 logger.log(logger.LOW, "Message: " +msg.getMessage());
\r
58 logger.log(logger.LOW, "***************************");
\r
61 logger.log(logger.EXTREME, "JTidy Results: "+msg.getMessage());
\r
66 public EnmlConverter(ApplicationLogger l) {
\r
69 saveInvalidXML = false;
\r
70 resources = new ArrayList<String>();
\r
73 public List<String> getResources() {
\r
76 public String convert(String noteGuid, String content) {
\r
77 logger.log(logger.HIGH, "Entering DBRunner.convertToEnml");
\r
78 logger.log(logger.EXTREME, "Note Text:" +content);
\r
80 // Replace the en-note tags with body tags in case we came from
\r
81 // someplace other than the editor (for example, if we are merging notes).
\r
82 content = content.replace("<en-note>", "<body>");
\r
83 content = content.replace("</en-note>", "</body>");
\r
84 // Start removing stuff we don't need or want
\r
85 int br = content.lastIndexOf("</body>");
\r
87 content = new String(content.substring(0,br));
\r
89 int k = content.indexOf("<body");
\r
91 newContent = new String(content.substring(k));
\r
93 newContent = "<body>"+content;
\r
96 // Check that we have a valid header. Normally we should not
\r
97 // but sometimes it seems that we can. I don't see how, but it is
\r
98 // easy enough to check.
\r
99 if (!newContent.startsWith("<?xml"))
\r
100 newContent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
\r
101 +"<!DOCTYPE en-note SYSTEM \"http://xml.evernote.com/pub/enml2.dtd\">\n"
\r
106 // Fix the more common XML problems that Webkit creates, but are not considered
\r
108 newContent = fixStupidXMLProblems(newContent);
\r
111 // Change the contents to have enml instead of body tags or
\r
112 // we'll fail validation later.
\r
113 newContent = newContent.replace("<body", "<en-note");
\r
114 newContent = newContent.replace("</body>", "</en-note>");
\r
116 // First pass through the data. The goal of this pass is to
\r
117 // validate that we have a good XML document and to repair
\r
118 // any problems found.
\r
120 XMLNoteRepair repair = new XMLNoteRepair();
\r
121 // logger.log(logger.HIGH, "Checking XML Structure");
\r
122 // newContent = repair.parse(newContent, false);
\r
123 // logger.log(logger.HIGH, "Check complete");
\r
125 Tidy tidy = new Tidy();
\r
126 TidyListener tidyListener = new TidyListener(logger);
\r
127 tidy.setMessageListener(tidyListener);
\r
128 tidy.getStderr().close(); // the listener will capture messages
\r
129 tidy.setXmlTags(true);
\r
130 byte html[] = newContent.getBytes();
\r
131 ByteArrayInputStream is = new ByteArrayInputStream(html);
\r
132 ByteArrayOutputStream os = new ByteArrayOutputStream();
\r
133 tidy.parse(is, os);
\r
134 newContent = os.toString();
\r
136 if (tidyListener.errorFound) {
\r
137 logger.log(logger.LOW, "Note Contents Begin");
\r
138 logger.log(logger.LOW, content);
\r
139 logger.log(logger.LOW, "Note Contents End");
\r
142 if (newContent.trim().equals(""))
\r
146 // If the repair above returned null, then the XML is foobar.
\r
147 // We are done here.
\r
148 if (newContent == null) {
\r
149 // Houston, we've had a problem.
\r
150 logger.log(logger.LOW, "Parse error when converting to ENML");
\r
151 logger.log(logger.LOW, "Start of unmodified note HTML");
\r
152 logger.log(logger.LOW, content);
\r
153 logger.log(logger.LOW, "End of unmodified note HTML");
\r
154 logger.log(logger.LOW, "Start of modified note HTML");
\r
155 logger.log(logger.LOW, newContent);
\r
156 logger.log(logger.LOW, "End of modified note HTML");
\r
157 // logger.log(logger.LOW, result.errorMessage);
\r
158 // logger.log(logger.LOW, "Error Line:Column "+result.errorLine+":" +result.errorColumn);
\r
164 // Second pass through the data. The goal of this pass is to
\r
165 // remove any things we added in NeverNote that do not match
\r
167 XMLCleanup v = new XMLCleanup();
\r
168 v.setValue(newContent);
\r
169 logger.log(logger.HIGH, "Beginning ENML Cleanup");
\r
171 logger.log(logger.HIGH, "Cleanup complete.");
\r
175 // Final pass through the data. In this one we
\r
176 // remove any invalid attributes and to save the
\r
178 logger.log(logger.EXTREME, "Rebuilt ENML:");
\r
179 logger.log(logger.EXTREME, v.getValue());
\r
180 logger.log(logger.EXTREME, "End Of Rebuilt ENML:");
\r
181 resources = v.getResources();
\r
184 // The XML has the dtd to validate set against Evernote's web
\r
185 // address. We change it to a local one because otherwise it would
\r
186 // fail if the user doesn't have internet connectivity. The local copy
\r
187 // also contains the 3 other PUBLIC definitions at the beginning of the dtd.
\r
188 newContent = v.getValue();
\r
189 File dtdFile = Global.getFileManager().getXMLDirFile("enml2.dtd");
\r
190 String dtd = dtdFile.toURI().toString();
\r
191 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \'http://xml.evernote.com/pub/enml2.dtd'>",
\r
192 "<!DOCTYPE en-note SYSTEM \"" +dtd +"\">");
\r
194 logger.log(logger.HIGH, "Validating ENML");
\r
195 newContent = repair.parse(newContent, true);
\r
196 logger.log(logger.HIGH, "Validation complete");
\r
197 saveInvalidXML = repair.saveInvalidXML;
\r
199 // Restore the correct XML header.
\r
200 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \"" +dtd +"\">",
\r
201 "<!DOCTYPE en-note SYSTEM 'http://xml.evernote.com/pub/enml2.dtd'>");
\r
209 // Fix XML problems that Qt can't deal with
\r
210 public String fixStupidXMLProblems(String content) {
\r
211 logger.log(logger.HIGH, "Entering DBRunner.fixStupidXMLProblems");
\r
213 // Fix the problem that the document body isn't properly closed
\r
214 String newContent = new String(content);
\r
215 logger.log(logger.MEDIUM, "Inside fixStupidXMLProblems. Old content:");
\r
216 logger.log(logger.MEDIUM, content);
\r
218 // Fix the problem that the img tag isn't properly closed
\r
220 logger.log(logger.MEDIUM, "Checking img tags");
\r
221 for (int i=newContent.indexOf("<img"); i>0; i = newContent.indexOf("<img",i+1)) {
\r
222 endPos = newContent.indexOf(">",i+1);
\r
223 String end = newContent.substring(endPos+1);
\r
224 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
227 // Fix the problem that the input tag isn't properly closed
\r
228 logger.log(logger.MEDIUM, "Checking input tags");
\r
229 for (int i=newContent.indexOf("<input"); i>0; i = newContent.indexOf("<input",i+1)) {
\r
230 endPos = newContent.indexOf(">",i+1);
\r
231 String end = newContent.substring(endPos+1);
\r
232 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
236 // Fix the problem that the <br> tag isn't properly closed
\r
237 logger.log(logger.MEDIUM, "Checking br tags");
\r
238 for (int i=newContent.indexOf("<br"); i>0; i = newContent.indexOf("<br",i+1)) {
\r
239 endPos = newContent.indexOf(">",i+1);
\r
240 String end = newContent.substring(endPos+1);
\r
241 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
244 // Fix the problem that the <hr> tag isn't properly closed
\r
245 logger.log(logger.MEDIUM, "Checking hr tags");
\r
246 for (int i=newContent.indexOf("<hr"); i>0; i = newContent.indexOf("<hr",i+1)) {
\r
247 endPos = newContent.indexOf(">",i+1);
\r
248 String end = newContent.substring(endPos+1);
\r
249 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
252 logger.log(logger.MEDIUM, "Leaving fixStupidXMLProblems");
\r
253 logger.log(logger.HIGH, "Leaving DBRunner.fixStupidXMLProblems");
\r
254 return newContent.toString();
\r
258 // Fix XML that Evernote thinks is invalid
\r
259 public String fixEnXMLCrap(String note) {
\r
264 StringBuffer buffer = new StringBuffer(note);
\r
266 // change all <b/> to <b></b> because Evernote hates them if they happen in <span>
\r
267 pos = buffer.indexOf("<b/>");
\r
269 buffer.replace(pos, pos+4, "<b></b>");
\r
270 pos = buffer.indexOf("<b/>",pos);
\r
272 // change all <br/> to <br></br> because Evernote hates them if they happen in <span>
\r
273 pos = buffer.indexOf("<br/>");
\r
275 buffer.replace(pos, pos+5, "<br></br>");
\r
276 pos = buffer.indexOf("<br/>",pos);
\r
279 // change all <span> elements in lists because Evernote hates them if they happen
\r
282 pos = buffer.indexOf("<li>");
\r
283 spanPos = buffer.indexOf("<span>");
\r
284 /* for (; pos>-1 && spanPos >-1;) {
\r
285 endPos = buffer.indexOf("</li>",pos);
\r
286 if (spanPos > pos && spanPos < endPos) {
\r
287 buffer.replace(spanPos,spanPos+6,"");
\r
288 spanPos = buffer.indexOf("</span>");
\r
289 buffer.replace(spanPos,spanPos+7,"");
\r
291 pos=buffer.indexOf("<li>",pos+1);
\r
292 spanPos = buffer.indexOf("<span>",spanPos);
\r
295 // Get rid of empty spans in <li> elements
\r
296 pos = buffer.indexOf("<li>");
\r
297 spanPos = buffer.indexOf("<span/>");
\r
298 for (; pos>-1 && spanPos >-1;) {
\r
299 endPos = buffer.indexOf("</li>",pos);
\r
300 if (spanPos > pos && spanPos < endPos) {
\r
301 buffer.replace(spanPos,spanPos+7,"");
\r
303 pos=buffer.indexOf("<li>",pos+1);
\r
304 spanPos = buffer.indexOf("<span/>",spanPos);
\r
307 return buffer.toString();
\r
310 // Fix stupid en-media problems
\r
311 public String fixEnMediaCrap(String note) {
\r
315 StringBuffer buffer = new StringBuffer(note);
\r
316 // get rid of any </en-media> tags since they shouldn't exist.
\r
317 int pos = buffer.indexOf("</en-media>");
\r
319 buffer.replace(pos, pos+11, "");
\r
320 pos = buffer.indexOf("</en-media>",pos);
\r
324 // Make sure we have a proper /> ending the en-media tag
\r
325 pos = buffer.indexOf("<en-media");
\r
327 pos=buffer.indexOf(">", pos);
\r
328 if (!buffer.substring(pos-1,pos).equals("/"))
\r
329 buffer.replace(pos, pos+1, " />");
\r
330 pos = buffer.indexOf("<en-media",pos);
\r
333 return buffer.toString();
\r