OSDN Git Service

Correct problem parsing notes where carriage returns did not separate some XML lines...
[neighbornote/NeighborNote.git] / src / cx / fbn / nevernote / evernote / EnmlConverter.java
1 /*\r
2  * This file is part of NeverNote \r
3  * Copyright 2009 Randy Baumgarte\r
4  * \r
5  * This file may be licensed under the terms of the\r
6  * GNU General Public License Version 2 (the ``GPL'').\r
7  *\r
8  * Software distributed under the License is distributed\r
9  * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either\r
10  * express or implied. See the GPL for the specific language\r
11  * governing rights and limitations.\r
12  *\r
13  * You should have received a copy of the GPL along with this\r
14  * program. If not, go to http://www.gnu.org/licenses/gpl.html\r
15  * or write to the Free Software Foundation, Inc.,\r
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\r
17  *\r
18 */\r
19 package cx.fbn.nevernote.evernote;\r
20 \r
21 import java.io.ByteArrayInputStream;\r
22 import java.io.ByteArrayOutputStream;\r
23 import java.io.File;\r
24 import java.util.ArrayList;\r
25 import java.util.List;\r
26 \r
27 import org.w3c.tidy.Tidy;\r
28 import org.w3c.tidy.TidyMessage;\r
29 \r
30 import cx.fbn.nevernote.Global;\r
31 import cx.fbn.nevernote.utilities.ApplicationLogger;\r
32 import cx.fbn.nevernote.xml.XMLCleanup;\r
33 import cx.fbn.nevernote.xml.XMLNoteRepair;\r
34 \r
35 public class EnmlConverter {\r
36         private final ApplicationLogger logger;\r
37         private List<String>                    resources;\r
38         public boolean saveInvalidXML;\r
39         \r
40         private class TidyListener implements org.w3c.tidy.TidyMessageListener {\r
41                 \r
42                 ApplicationLogger logger;\r
43                 public boolean errorFound; \r
44                 \r
45                 public TidyListener(ApplicationLogger logger) {\r
46                         this.logger = logger;\r
47                         errorFound = false;\r
48                 }\r
49                 @Override\r
50                 public void messageReceived(TidyMessage msg) {\r
51                         if (msg.getLevel() == TidyMessage.Level.ERROR) {\r
52                                 logger.log(logger.LOW, "******* JTIDY ERORR *******");\r
53                                 logger.log(logger.LOW, "Error Code: " +msg.getErrorCode());\r
54                                 logger.log(logger.LOW, "Column: " +msg.getColumn());\r
55                                 logger.log(logger.LOW, "Column: " +msg.getColumn());\r
56                                 logger.log(logger.LOW, "Line: " +msg.getLine());\r
57                                 logger.log(logger.LOW, "Message: " +msg.getMessage());\r
58                                 logger.log(logger.LOW, "***************************");\r
59                                 errorFound = true;\r
60                         } else \r
61                                 logger.log(logger.EXTREME, "JTidy Results: "+msg.getMessage());\r
62                 }\r
63                 \r
64         }\r
65         \r
66         public EnmlConverter(ApplicationLogger l) {\r
67                 logger = l;\r
68 //              conn = c;\r
69                 saveInvalidXML = false;\r
70                 resources = new ArrayList<String>();\r
71         }\r
72 \r
73         public List<String> getResources() {\r
74                 return resources;\r
75         }\r
76         public String convert(String noteGuid, String content) {\r
77                 logger.log(logger.HIGH, "Entering DBRunner.convertToEnml");\r
78                 logger.log(logger.EXTREME, "Note Text:" +content);\r
79                 \r
80                 // Replace the en-note tags with body tags in case we came from \r
81                 // someplace other than the editor (for example, if we are merging notes).\r
82                 content = content.replace("<en-note>", "<body>");\r
83                 content = content.replace("</en-note>", "</body>");\r
84                 // Start removing stuff we don't need or want\r
85                 int br = content.lastIndexOf("</body>");\r
86                 if (br > 0)\r
87                         content = new String(content.substring(0,br));\r
88                 String newContent;\r
89                 int k = content.indexOf("<body");\r
90                 if (k>-1)\r
91                         newContent = new String(content.substring(k));\r
92                 else\r
93                         newContent = "<body>"+content;\r
94 \r
95                 \r
96                 // Check that we have a vaild header.  Normally we should not\r
97                 // but sometimes it seems that we can.  I don't see how, but it is\r
98                 // easy enough to check.\r
99                 if (!newContent.startsWith("<?xml"))\r
100                         newContent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" \r
101                                 +"<!DOCTYPE en-note SYSTEM \"http://xml.evernote.com/pub/enml2.dtd\">\n"\r
102                                 +newContent \r
103                                 +"</body>";\r
104                 \r
105 \r
106                 // Fix the more common XML problems that Webkit creates, but are not considered \r
107                 // valid XML.\r
108                 newContent = fixStupidXMLProblems(newContent);\r
109                 \r
110                 \r
111                 // Change the contents to have enml instead of body tags or\r
112                 // we'll fail validation later.\r
113                 newContent = newContent.replace("<body", "<en-note");\r
114                 newContent = newContent.replace("</body>", "</en-note>");\r
115                 \r
116                 // First pass through the data.  The goal of this pass is to \r
117                 // validate that we have a good XML document and to repair\r
118                 // any problems found.\r
119                 \r
120                 XMLNoteRepair repair = new XMLNoteRepair();\r
121 //              logger.log(logger.HIGH, "Checking XML Structure");\r
122 //              newContent = repair.parse(newContent, false);\r
123 //              logger.log(logger.HIGH, "Check complete");\r
124         \r
125                 Tidy tidy = new Tidy();\r
126                 TidyListener tidyListener = new TidyListener(logger);\r
127                 tidy.setMessageListener(tidyListener);\r
128                 tidy.getStderr().close();  // the listener will capture messages\r
129                 tidy.setXmlTags(true);\r
130                 byte html[] = newContent.getBytes();\r
131                 ByteArrayInputStream is = new ByteArrayInputStream(html);\r
132                 ByteArrayOutputStream os = new ByteArrayOutputStream();\r
133                 tidy.parse(is, os);\r
134                 newContent = os.toString();\r
135                 \r
136                 if (tidyListener.errorFound) {\r
137                         logger.log(logger.LOW, "Note Contents Begin");\r
138                         logger.log(logger.LOW, content);\r
139                         logger.log(logger.LOW, "Note Contents End");\r
140                         newContent = null;\r
141                 } else {\r
142                         if (newContent.trim().equals(""))\r
143                                 newContent = null;\r
144                 }\r
145 \r
146                 // If the repair above returned null, then the XML is foobar.\r
147                 // We are done here.\r
148                 if (newContent == null) {\r
149                         // Houston, we've had a problem.\r
150                         logger.log(logger.LOW, "Parse error when converting to ENML");\r
151                         logger.log(logger.LOW, "Start of unmodified note HTML");\r
152                         logger.log(logger.LOW, content);\r
153                         logger.log(logger.LOW, "End of unmodified note HTML");\r
154                         logger.log(logger.LOW, "Start of modified note HTML");\r
155                         logger.log(logger.LOW, newContent);\r
156                         logger.log(logger.LOW, "End of modified note HTML");\r
157 //                              logger.log(logger.LOW, result.errorMessage);\r
158 //                              logger.log(logger.LOW, "Error Line:Column "+result.errorLine+":" +result.errorColumn);\r
159                         return null;\r
160 \r
161 \r
162                 }\r
163                 \r
164                 // Second pass through the data.  The goal of this pass is to \r
165                 // remove any things we added in NeverNote that do not match\r
166                 // the ENML schema\r
167                 XMLCleanup v = new XMLCleanup();\r
168                 v.setValue(newContent);\r
169                 logger.log(logger.HIGH, "Beginning ENML Cleanup");\r
170                 v.validate();\r
171                 logger.log(logger.HIGH, "Cleanup complete.");\r
172                 \r
173         \r
174                         \r
175                 // Final pass through the data.  In this one we\r
176                 // remove any invalid attributes and to save the\r
177                 // new resources.\r
178                 logger.log(logger.EXTREME, "Rebuilt ENML:");\r
179                 logger.log(logger.EXTREME, v.getValue());       \r
180                 logger.log(logger.EXTREME, "End Of Rebuilt ENML:");\r
181                 resources = v.getResources();\r
182 \r
183                 \r
184                 // The XML has the dtd to validate set against Evernote's web\r
185                 // address.  We change it to a local one because otherwise it would\r
186                 // fail if the user doesn't have internet connectivity.  The local copy\r
187                 // also contains the 3 other PUBLIC definitions at the beginning of the dtd.\r
188                 newContent = v.getValue();\r
189                 File dtdFile = Global.getFileManager().getXMLDirFile("enml2.dtd");\r
190                 String dtd = dtdFile.toURI().toString();\r
191                 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \'http://xml.evernote.com/pub/enml2.dtd'>", \r
192                                 "<!DOCTYPE en-note SYSTEM \"" +dtd +"\">");\r
193                 \r
194                 logger.log(logger.HIGH, "Validating ENML");\r
195                 newContent = repair.parse(newContent, true);\r
196                 logger.log(logger.HIGH, "Validation complete");\r
197                 saveInvalidXML = repair.saveInvalidXML;\r
198                 \r
199                 // Restore the correct XML header.\r
200                 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \"" +dtd +"\">", \r
201                                 "<!DOCTYPE en-note SYSTEM 'http://xml.evernote.com/pub/enml2.dtd'>");\r
202                 \r
203                 \r
204                 \r
205                 return newContent;\r
206         }\r
207         \r
208         \r
209         // Fix XML problems that Qt can't deal with\r
210         public String fixStupidXMLProblems(String content) {\r
211                 logger.log(logger.HIGH, "Entering DBRunner.fixStupidXMLProblems");\r
212 \r
213                 // Fix the problem that the document body isn't properly closed\r
214                 String newContent = new String(content);\r
215                 logger.log(logger.MEDIUM, "Inside fixStupidXMLProblems.  Old content:");\r
216                 logger.log(logger.MEDIUM, content);\r
217                 \r
218                 // Fix the problem that the img tag isn't properly closed\r
219                 int endPos;\r
220                 logger.log(logger.MEDIUM, "Checking img tags");\r
221                 for (int i=newContent.indexOf("<img"); i>0; i = newContent.indexOf("<img",i+1)) {\r
222                         endPos = newContent.indexOf(">",i+1);\r
223                         String end = newContent.substring(endPos+1);\r
224                         newContent = newContent.subSequence(0,endPos) +"/>"+end;\r
225                 }\r
226                 \r
227                 // Fix the problem that the input tag isn't properly closed\r
228                 logger.log(logger.MEDIUM, "Checking input tags");\r
229                 for (int i=newContent.indexOf("<input"); i>0; i = newContent.indexOf("<input",i+1)) {\r
230                         endPos = newContent.indexOf(">",i+1);\r
231                         String end = newContent.substring(endPos+1);\r
232                         newContent = newContent.subSequence(0,endPos) +"/>"+end;\r
233                 }\r
234                 \r
235                 \r
236                 // Fix the problem that the <br> tag isn't properly closed\r
237                 logger.log(logger.MEDIUM, "Checking br tags");\r
238                 for (int i=newContent.indexOf("<br"); i>0; i = newContent.indexOf("<br",i+1)) {\r
239                         endPos = newContent.indexOf(">",i+1);\r
240                         String end = newContent.substring(endPos+1);\r
241                         newContent = newContent.subSequence(0,endPos) +"/>"+end;\r
242                 }\r
243                         \r
244                 // Fix the problem that the <hr> tag isn't properly closed\r
245                 logger.log(logger.MEDIUM, "Checking hr tags");\r
246                 for (int i=newContent.indexOf("<hr"); i>0; i = newContent.indexOf("<hr",i+1)) {\r
247                         endPos = newContent.indexOf(">",i+1);\r
248                         String end = newContent.substring(endPos+1);\r
249                         newContent = newContent.subSequence(0,endPos) +"/>"+end;\r
250                 }\r
251                 \r
252                 logger.log(logger.MEDIUM, "Leaving fixStupidXMLProblems");\r
253                 logger.log(logger.HIGH, "Leaving DBRunner.fixStupidXMLProblems");\r
254                 return newContent.toString();\r
255         }\r
256 \r
257 \r
258         // Fix XML that Evernote thinks is invalid\r
259         public String fixEnXMLCrap(String note) {\r
260                 if (note == null)\r
261                         return null;\r
262                 \r
263                 int pos;\r
264                 StringBuffer buffer = new StringBuffer(note);\r
265                 \r
266                 // change all <b/> to <b></b> because Evernote hates them if they happen in <span>\r
267                 pos = buffer.indexOf("<b/>");\r
268                 for (; pos>-1; ) {\r
269                         buffer.replace(pos, pos+4, "<b></b>");\r
270                         pos = buffer.indexOf("<b/>",pos);\r
271                 }\r
272                 // change all <br/> to <br></br> because Evernote hates them if they happen in <span>\r
273                 pos = buffer.indexOf("<br/>");\r
274                 for (; pos>-1; ) {\r
275                         buffer.replace(pos, pos+5, "<br></br>");\r
276                         pos = buffer.indexOf("<br/>",pos);\r
277                 }\r
278                 \r
279                 // change all <span> elements in lists because Evernote hates them if they happen \r
280                 int endPos = 0;\r
281                 int spanPos;\r
282                 pos = buffer.indexOf("<li>");\r
283                 spanPos = buffer.indexOf("<span>");\r
284 /*              for (; pos>-1 && spanPos >-1;) {\r
285                         endPos = buffer.indexOf("</li>",pos);\r
286                         if (spanPos > pos && spanPos < endPos) {\r
287                                 buffer.replace(spanPos,spanPos+6,"");\r
288                                 spanPos = buffer.indexOf("</span>");                            \r
289                                 buffer.replace(spanPos,spanPos+7,"");\r
290                         }\r
291                         pos=buffer.indexOf("<li>",pos+1);\r
292                         spanPos = buffer.indexOf("<span>",spanPos);\r
293                 }\r
294 */              \r
295                 // Get rid of empty spans in <li> elements\r
296                 pos = buffer.indexOf("<li>");\r
297                 spanPos = buffer.indexOf("<span/>");\r
298                 for (; pos>-1 && spanPos >-1;) {\r
299                         endPos = buffer.indexOf("</li>",pos);\r
300                         if (spanPos > pos && spanPos < endPos) {\r
301                                 buffer.replace(spanPos,spanPos+7,"");\r
302                         }\r
303                         pos=buffer.indexOf("<li>",pos+1);\r
304                         spanPos = buffer.indexOf("<span/>",spanPos);\r
305                 }\r
306                 \r
307                 return buffer.toString();\r
308         }\r
309         \r
310         // Fix stupid en-media problems\r
311         public String fixEnMediaCrap(String note) {\r
312                 if (note == null)\r
313                         return null;\r
314                 \r
315                 StringBuffer buffer = new StringBuffer(note);\r
316                 // get rid of any </en-media> tags since they shouldn't exist.\r
317                 int pos = buffer.indexOf("</en-media>");\r
318                 for (; pos>-1; ) {\r
319                         buffer.replace(pos, pos+11, "");\r
320                         pos = buffer.indexOf("</en-media>",pos);\r
321                 }\r
322                 \r
323                 \r
324                 // Make sure we have a proper /> ending the en-media tag\r
325                 pos = buffer.indexOf("<en-media");\r
326                 for (; pos>-1; ) {\r
327                         pos=buffer.indexOf(">", pos);\r
328                         if (!buffer.substring(pos-1,pos).equals("/"))\r
329                         buffer.replace(pos, pos+1, " />");\r
330                         pos = buffer.indexOf("<en-media",pos);\r
331                 }\r
332                 \r
333                 return buffer.toString();\r
334         }\r
335 }\r