2 * This file is part of NeverNote
\r
3 * Copyright 2009 Randy Baumgarte
\r
5 * This file may be licensed under the terms of the
\r
6 * GNU General Public License Version 2 (the ``GPL'').
\r
8 * Software distributed under the License is distributed
\r
9 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
\r
10 * express or implied. See the GPL for the specific language
\r
11 * governing rights and limitations.
\r
13 * You should have received a copy of the GPL along with this
\r
14 * program. If not, go to http://www.gnu.org/licenses/gpl.html
\r
15 * or write to the Free Software Foundation, Inc.,
\r
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
\r
19 package cx.fbn.nevernote.evernote;
\r
21 import java.io.ByteArrayInputStream;
\r
22 import java.io.ByteArrayOutputStream;
\r
23 import java.io.File;
\r
24 import java.util.ArrayList;
\r
25 import java.util.List;
\r
27 import org.w3c.tidy.Tidy;
\r
29 import cx.fbn.nevernote.Global;
\r
30 import cx.fbn.nevernote.utilities.ApplicationLogger;
\r
31 import cx.fbn.nevernote.xml.XMLCleanup;
\r
32 import cx.fbn.nevernote.xml.XMLNoteRepair;
\r
34 public class EnmlConverter {
\r
35 private final ApplicationLogger logger;
\r
36 private List<String> resources;
\r
37 public boolean saveInvalidXML;
\r
39 public EnmlConverter(ApplicationLogger l) {
\r
42 saveInvalidXML = false;
\r
43 resources = new ArrayList<String>();
\r
46 public List<String> getResources() {
\r
49 public String convert(String noteGuid, String content) {
\r
50 logger.log(logger.HIGH, "Entering DBRunner.convertToEnml");
\r
51 logger.log(logger.EXTREME, "Note Text:" +content);
\r
53 // Replace the en-note tags with body tags in case we came from
\r
54 // someplace other than the editor (for example, if we are merging notes).
\r
55 content = content.replace("<en-note>", "<body>");
\r
56 content = content.replace("</en-note>", "</body>");
\r
57 // Start removing stuff we don't need or want
\r
58 int br = content.lastIndexOf("</body>");
\r
60 content = new String(content.substring(0,br));
\r
62 int k = content.indexOf("<body");
\r
64 newContent = new String(content.substring(k));
\r
66 newContent = "<body>"+content;
\r
69 // Check that we have a valid header. Normally we should not
\r
70 // but sometimes it seems that we can. I don't see how, but it is
\r
71 // easy enough to check.
\r
72 if (!newContent.startsWith("<?xml"))
\r
73 newContent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
\r
74 +"<!DOCTYPE en-note SYSTEM \"http://xml.evernote.com/pub/enml2.dtd\">\n"
\r
79 // Fix the more common XML problems that Webkit creates, but are not considered
\r
81 newContent = fixStupidXMLProblems(newContent);
\r
84 // Change the contents to have enml instead of body tags or
\r
85 // we'll fail validation later.
\r
86 newContent = newContent.replace("<body", "<en-note");
\r
87 newContent = newContent.replace("</body>", "</en-note>");
\r
89 // First pass through the data. The goal of this pass is to
\r
90 // validate that we have a good XML document and to repair
\r
91 // any problems found.
\r
93 XMLNoteRepair repair = new XMLNoteRepair();
\r
94 // logger.log(logger.HIGH, "Checking XML Structure");
\r
95 // newContent = repair.parse(newContent, false);
\r
96 // logger.log(logger.HIGH, "Check complete");
\r
98 Tidy tidy = new Tidy();
\r
99 tidy.setXmlTags(true);
\r
100 byte html[] = newContent.getBytes();
\r
101 ByteArrayInputStream is = new ByteArrayInputStream(html);
\r
102 ByteArrayOutputStream os = new ByteArrayOutputStream();
\r
103 tidy.parse(is, os);
\r
104 newContent = os.toString();
\r
106 if (newContent.trim().equals(""))
\r
109 // If the repair above returned null, then the XML is foobar.
\r
110 // We are done here.
\r
111 if (newContent == null) {
\r
112 // Houston, we've had a problem.
\r
113 logger.log(logger.LOW, "Parse error when converting to ENML");
\r
114 logger.log(logger.LOW, "Start of unmodified note HTML");
\r
115 logger.log(logger.LOW, content);
\r
116 logger.log(logger.LOW, "End of unmodified note HTML");
\r
117 logger.log(logger.LOW, "Start of modified note HTML");
\r
118 logger.log(logger.LOW, newContent);
\r
119 logger.log(logger.LOW, "End of modified note HTML");
\r
120 // logger.log(logger.LOW, result.errorMessage);
\r
121 // logger.log(logger.LOW, "Error Line:Column "+result.errorLine+":" +result.errorColumn);
\r
127 // Second pass through the data. The goal of this pass is to
\r
128 // remove any things we added in NeverNote that do not match
\r
130 XMLCleanup v = new XMLCleanup();
\r
131 v.setValue(newContent);
\r
132 logger.log(logger.HIGH, "Beginning ENML Cleanup");
\r
134 logger.log(logger.HIGH, "Cleanup complete.");
\r
138 // Final pass through the data. In this one we
\r
139 // remove any invalid attributes and to save the
\r
141 logger.log(logger.EXTREME, "Rebuilt ENML:");
\r
142 logger.log(logger.EXTREME, v.getValue());
\r
143 logger.log(logger.EXTREME, "End Of Rebuilt ENML:");
\r
144 resources = v.getResources();
\r
147 // The XML has the dtd to validate set against Evernote's web
\r
148 // address. We change it to a local one because otherwise it would
\r
149 // fail if the user doesn't have internet connectivity. The local copy
\r
150 // also contains the 3 other PUBLIC definitions at the beginning of the dtd.
\r
151 newContent = v.getValue();
\r
152 File dtdFile = Global.getFileManager().getXMLDirFile("enml2.dtd");
\r
153 String dtd = dtdFile.toURI().toString();
\r
154 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \'http://xml.evernote.com/pub/enml2.dtd'>",
\r
155 "<!DOCTYPE en-note SYSTEM \"" +dtd +"\">");
\r
157 logger.log(logger.HIGH, "Validating ENML");
\r
158 newContent = repair.parse(newContent, true);
\r
159 logger.log(logger.HIGH, "Validation complete");
\r
160 saveInvalidXML = repair.saveInvalidXML;
\r
162 // Restore the correct XML header.
\r
163 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \"" +dtd +"\">",
\r
164 "<!DOCTYPE en-note SYSTEM 'http://xml.evernote.com/pub/enml2.dtd'>");
\r
172 // Fix XML problems that Qt can't deal with
\r
173 public String fixStupidXMLProblems(String content) {
\r
174 logger.log(logger.HIGH, "Entering DBRunner.fixStupidXMLProblems");
\r
176 // Fix the problem that the document body isn't properly closed
\r
177 String newContent = new String(content);
\r
178 logger.log(logger.MEDIUM, "Inside fixStupidXMLProblems. Old content:");
\r
179 logger.log(logger.MEDIUM, content);
\r
181 // Fix the problem that the img tag isn't properly closed
\r
183 logger.log(logger.MEDIUM, "Checking img tags");
\r
184 for (int i=newContent.indexOf("<img"); i>0; i = newContent.indexOf("<img",i+1)) {
\r
185 endPos = newContent.indexOf(">",i+1);
\r
186 String end = newContent.substring(endPos+1);
\r
187 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
190 // Fix the problem that the input tag isn't properly closed
\r
191 logger.log(logger.MEDIUM, "Checking input tags");
\r
192 for (int i=newContent.indexOf("<input"); i>0; i = newContent.indexOf("<input",i+1)) {
\r
193 endPos = newContent.indexOf(">",i+1);
\r
194 String end = newContent.substring(endPos+1);
\r
195 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
199 // Fix the problem that the <br> tag isn't properly closed
\r
200 logger.log(logger.MEDIUM, "Checking br tags");
\r
201 for (int i=newContent.indexOf("<br"); i>0; i = newContent.indexOf("<br",i+1)) {
\r
202 endPos = newContent.indexOf(">",i+1);
\r
203 String end = newContent.substring(endPos+1);
\r
204 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
207 // Fix the problem that the <hr> tag isn't properly closed
\r
208 logger.log(logger.MEDIUM, "Checking hr tags");
\r
209 for (int i=newContent.indexOf("<hr"); i>0; i = newContent.indexOf("<hr",i+1)) {
\r
210 endPos = newContent.indexOf(">",i+1);
\r
211 String end = newContent.substring(endPos+1);
\r
212 newContent = newContent.subSequence(0,endPos) +"/>"+end;
\r
215 logger.log(logger.MEDIUM, "Leaving fixStupidXMLProblems");
\r
216 logger.log(logger.HIGH, "Leaving DBRunner.fixStupidXMLProblems");
\r
217 return newContent.toString();
\r
221 // Fix XML that Evernote thinks is invalid
\r
222 public String fixEnXMLCrap(String note) {
\r
227 StringBuffer buffer = new StringBuffer(note);
\r
229 // change all <b/> to <b></b> because Evernote hates them if they happen in <span>
\r
230 pos = buffer.indexOf("<b/>");
\r
232 buffer.replace(pos, pos+4, "<b></b>");
\r
233 pos = buffer.indexOf("<b/>",pos);
\r
235 // change all <br/> to <br></br> because Evernote hates them if they happen in <span>
\r
236 pos = buffer.indexOf("<br/>");
\r
238 buffer.replace(pos, pos+5, "<br></br>");
\r
239 pos = buffer.indexOf("<br/>",pos);
\r
242 // change all <span> elements in lists because Evernote hates them if they happen
\r
245 pos = buffer.indexOf("<li>");
\r
246 spanPos = buffer.indexOf("<span>");
\r
247 /* for (; pos>-1 && spanPos >-1;) {
\r
248 endPos = buffer.indexOf("</li>",pos);
\r
249 if (spanPos > pos && spanPos < endPos) {
\r
250 buffer.replace(spanPos,spanPos+6,"");
\r
251 spanPos = buffer.indexOf("</span>");
\r
252 buffer.replace(spanPos,spanPos+7,"");
\r
254 pos=buffer.indexOf("<li>",pos+1);
\r
255 spanPos = buffer.indexOf("<span>",spanPos);
\r
258 // Get rid of empty spans in <li> elements
\r
259 pos = buffer.indexOf("<li>");
\r
260 spanPos = buffer.indexOf("<span/>");
\r
261 for (; pos>-1 && spanPos >-1;) {
\r
262 endPos = buffer.indexOf("</li>",pos);
\r
263 if (spanPos > pos && spanPos < endPos) {
\r
264 buffer.replace(spanPos,spanPos+7,"");
\r
266 pos=buffer.indexOf("<li>",pos+1);
\r
267 spanPos = buffer.indexOf("<span/>",spanPos);
\r
270 return buffer.toString();
\r
273 // Fix stupid en-media problems
\r
274 public String fixEnMediaCrap(String note) {
\r
278 StringBuffer buffer = new StringBuffer(note);
\r
279 // get rid of any </en-media> tags since they shouldn't exist.
\r
280 int pos = buffer.indexOf("</en-media>");
\r
282 buffer.replace(pos, pos+11, "");
\r
283 pos = buffer.indexOf("</en-media>",pos);
\r
287 // Make sure we have a proper /> ending the en-media tag
\r
288 pos = buffer.indexOf("<en-media");
\r
290 pos=buffer.indexOf(">", pos);
\r
291 if (!buffer.substring(pos-1,pos).equals("/"))
\r
292 buffer.replace(pos, pos+1, " />");
\r
293 pos = buffer.indexOf("<en-media",pos);
\r
296 return buffer.toString();
\r