OSDN Git Service

Updated comments in some of the code.
[neighbornote/NeighborNote.git] / src / cx / fbn / nevernote / evernote / EnmlConverter.java
1 /*\r
2  * This file is part of NeverNote \r
3  * Copyright 2009 Randy Baumgarte\r
4  * \r
5  * This file may be licensed under the terms of the\r
6  * GNU General Public License Version 2 (the ``GPL'').\r
7  *\r
8  * Software distributed under the License is distributed\r
9  * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either\r
10  * express or implied. See the GPL for the specific language\r
11  * governing rights and limitations.\r
12  *\r
13  * You should have received a copy of the GPL along with this\r
14  * program. If not, go to http://www.gnu.org/licenses/gpl.html\r
15  * or write to the Free Software Foundation, Inc.,\r
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\r
17  *\r
18 */\r
19 package cx.fbn.nevernote.evernote;\r
20 \r
21 //**********************************************\r
22 //**********************************************\r
23 //* This is used to turn HTML into ENML compliant\r
24 //* data.\r
25 //**********************************************\r
26 //**********************************************\r
27 \r
28 import java.io.ByteArrayInputStream;\r
29 import java.io.ByteArrayOutputStream;\r
30 import java.io.File;\r
31 import java.util.ArrayList;\r
32 import java.util.List;\r
33 \r
34 import org.w3c.tidy.Tidy;\r
35 import org.w3c.tidy.TidyMessage;\r
36 \r
37 import com.trolltech.qt.core.QByteArray;\r
38 import com.trolltech.qt.core.QTextCodec;\r
39 \r
40 import cx.fbn.nevernote.Global;\r
41 import cx.fbn.nevernote.utilities.ApplicationLogger;\r
42 import cx.fbn.nevernote.utilities.Pair;\r
43 import cx.fbn.nevernote.xml.XMLCleanup;\r
44 import cx.fbn.nevernote.xml.XMLNoteRepair;\r
45 \r
46 public class EnmlConverter {\r
47         private final ApplicationLogger logger;\r
48         private List<String>                    resources;\r
49         public boolean saveInvalidXML;\r
50         \r
51         private class TidyListener implements org.w3c.tidy.TidyMessageListener {\r
52                 \r
53                 ApplicationLogger logger;\r
54                 public boolean errorFound; \r
55                 \r
56                 public TidyListener(ApplicationLogger logger) {\r
57                         this.logger = logger;\r
58                         errorFound = false;\r
59                 }\r
60                 @Override\r
61                 public void messageReceived(TidyMessage msg) {\r
62                         if (msg.getLevel() == TidyMessage.Level.ERROR) {\r
63                                 logger.log(logger.LOW, "******* JTIDY ERORR *******");\r
64                                 logger.log(logger.LOW, "Error Code: " +msg.getErrorCode());\r
65                                 logger.log(logger.LOW, "Column: " +msg.getColumn());\r
66                                 logger.log(logger.LOW, "Column: " +msg.getColumn());\r
67                                 logger.log(logger.LOW, "Line: " +msg.getLine());\r
68                                 logger.log(logger.LOW, "Message: " +msg.getMessage());\r
69                                 logger.log(logger.LOW, "***************************");\r
70                                 errorFound = true;\r
71                         } else \r
72                                 logger.log(logger.EXTREME, "JTidy Results: "+msg.getMessage());\r
73                 }\r
74                 \r
75         }\r
76         \r
77         public EnmlConverter(ApplicationLogger l) {\r
78                 logger = l;\r
79 //              conn = c;\r
80                 saveInvalidXML = false;\r
81                 resources = new ArrayList<String>();\r
82         }\r
83 \r
84         public List<String> getResources() {\r
85                 return resources;\r
86         }\r
87         public String convert(String noteGuid, String content) {\r
88                 logger.log(logger.HIGH, "Entering DBRunner.convertToEnml");\r
89                 logger.log(logger.EXTREME, "Note Text:" +content);\r
90                 \r
91                 // Replace the en-note tags with body tags in case we came from \r
92                 // someplace other than the editor (for example, if we are merging notes).\r
93                 content = content.replace("<en-note>", "<body>");\r
94                 content = content.replace("</en-note>", "</body>");\r
95                 // Start removing stuff we don't need or want\r
96                 int br = content.lastIndexOf("</body>");\r
97                 if (br > 0)\r
98                         content = new String(content.substring(0,br));\r
99                 String newContent;\r
100                 int k = content.indexOf("<body");\r
101                 if (k>-1)\r
102                         newContent = new String(content.substring(k));\r
103                 else\r
104                         newContent = "<body>"+content;\r
105 \r
106                 \r
107                 // Check that we have a vaild header.  Normally we should not\r
108                 // but sometimes it seems that we can.  I don't see how, but it is\r
109                 // easy enough to check.\r
110                 if (!newContent.startsWith("<?xml"))\r
111                         newContent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" \r
112                                 +"<!DOCTYPE en-note SYSTEM \"http://xml.evernote.com/pub/enml2.dtd\">\n"\r
113                                 +newContent \r
114                                 +"</body>";\r
115                 \r
116 \r
117                 // Fix the more common XML problems that Webkit creates, but are not considered \r
118                 // valid XML.\r
119                 newContent = fixStupidXMLProblems(newContent);\r
120                 \r
121                 \r
122                 // Change the contents to have enml instead of body tags or\r
123                 // we'll fail validation later.\r
124                 newContent = newContent.replace("<body", "<en-note");\r
125                 newContent = newContent.replace("</body>", "</en-note>");\r
126                 \r
127                 // First pass through the data.  The goal of this pass is to \r
128                 // validate that we have a good XML document and to repair\r
129                 // any problems found.\r
130                 \r
131                 XMLNoteRepair repair = new XMLNoteRepair();\r
132 //              logger.log(logger.HIGH, "Checking XML Structure");\r
133 //              newContent = repair.parse(newContent, false);\r
134 //              logger.log(logger.HIGH, "Check complete");\r
135         \r
136         logger.log(logger.HIGH, "Fixing encryption tags");\r
137         newContent = fixEncryptionTags(newContent);\r
138                 \r
139                 Tidy tidy = new Tidy();\r
140                 TidyListener tidyListener = new TidyListener(logger);\r
141                 tidy.setMessageListener(tidyListener);\r
142                 tidy.getStderr().close();  // the listener will capture messages\r
143                 tidy.setXmlTags(true);\r
144                 \r
145                 QTextCodec codec;\r
146                 codec = QTextCodec.codecForName("UTF-8");\r
147         QByteArray unicode =  codec.fromUnicode(newContent);\r
148         \r
149 //              byte html[] = newContent.getBytes();\r
150 //              ByteArrayInputStream is = new ByteArrayInputStream(html);\r
151         logger.log(logger.HIGH, "Starting JTidy check");\r
152         logger.log(logger.EXTREME, "Start of JTidy Input");\r
153         logger.log(logger.EXTREME, newContent);\r
154         logger.log(logger.EXTREME, "End Of JTidy Input");\r
155                 ByteArrayInputStream is = new ByteArrayInputStream(unicode.toByteArray());\r
156         ByteArrayOutputStream os = new ByteArrayOutputStream();\r
157         tidy.setInputEncoding("UTF-8");\r
158                 tidy.parse(is, os);\r
159                 String tidyContent = os.toString();\r
160                 if (tidyListener.errorFound) {\r
161                         logger.log(logger.LOW, "Note Contents Begin");\r
162                         logger.log(logger.LOW, content);\r
163                         logger.log(logger.LOW, "Note Contents End");\r
164                         tidyContent = null;\r
165                 } else {\r
166                         if (newContent.trim().equals(""))\r
167                                 tidyContent = null;\r
168                 }\r
169 \r
170                 // If the repair above returned null, then the XML is foobar.\r
171                 // We are done here.\r
172                 if (tidyContent != null) {\r
173                         newContent = tidyContent;\r
174                 } else {\r
175                         // Houston, we've had a problem.  Fall back to old method\r
176                         logger.log(logger.HIGH, "Error converting to JTidy.  Falling back to old method");\r
177                         String repairedContent = repair.parse(newContent, false);\r
178                         if (repairedContent == null) {\r
179                                 logger.log(logger.EXTREME, "Null returned from repair.parse()");\r
180                                 logger.log(logger.LOW, "Parse error when converting to ENML. Aborting save");\r
181                                 return null;\r
182                         }\r
183                         newContent = repairedContent;\r
184                         logger.log(logger.EXTREME, "Start of repaired content");\r
185                         logger.log(logger.EXTREME, repairedContent);\r
186                         logger.log(logger.EXTREME, "End of repaired content");\r
187                 }\r
188                 \r
189                 // Second pass through the data.  The goal of this pass is to \r
190                 // remove any things we added in NeverNote that do not match\r
191                 // the ENML schema\r
192                 XMLCleanup v = new XMLCleanup();\r
193                 v.setValue(newContent);\r
194                 logger.log(logger.HIGH, "Beginning ENML Cleanup");\r
195                 v.validate();\r
196                 logger.log(logger.HIGH, "Cleanup complete.");\r
197                 \r
198         \r
199                         \r
200                 // Final pass through the data.  In this one we\r
201                 // remove any invalid attributes and to save the\r
202                 // new resources.\r
203                 logger.log(logger.EXTREME, "Rebuilt ENML:");\r
204                 logger.log(logger.EXTREME, v.getValue());       \r
205                 logger.log(logger.EXTREME, "End Of Rebuilt ENML:");\r
206                 resources = v.getResources();\r
207 \r
208                 \r
209                 // The XML has the dtd to validate set against Evernote's web\r
210                 // address.  We change it to a local one because otherwise it would\r
211                 // fail if the user doesn't have internet connectivity.  The local copy\r
212                 // also contains the 3 other PUBLIC definitions at the beginning of the dtd.\r
213                 newContent = v.getValue();\r
214                 File dtdFile = Global.getFileManager().getXMLDirFile("enml2.dtd");\r
215                 String dtd = dtdFile.toURI().toString();\r
216                 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \'http://xml.evernote.com/pub/enml2.dtd'>", \r
217                                 "<!DOCTYPE en-note SYSTEM \"" +dtd +"\">");\r
218                 \r
219                 logger.log(logger.HIGH, "Validating ENML");\r
220                 String repairedContent = repair.parse(newContent, true);\r
221                 if (repairedContent == null)\r
222                         logger.log(logger.EXTREME, "Null returned from repair.parse()");\r
223                 else\r
224                         newContent = repairedContent;\r
225                 logger.log(logger.HIGH, "Validation complete");\r
226                 saveInvalidXML = repair.saveInvalidXML;\r
227                 \r
228                 // Restore the correct XML header.\r
229                 newContent = newContent.replace("<!DOCTYPE en-note SYSTEM \"" +dtd +"\">", \r
230                                 "<!DOCTYPE en-note SYSTEM 'http://xml.evernote.com/pub/enml2.dtd'>");\r
231                 \r
232                 \r
233                 logger.log(logger.EXTREME, "Leaving ENMLConverter.convert()");\r
234                 return newContent;\r
235         }\r
236 \r
237         \r
238         private String fixEncryptionTags(String content) {\r
239                 // Fix the problem that the document body isn't properly closed\r
240                 String newContent = new String(content);\r
241                 logger.log(logger.MEDIUM, "Inside EnmlConverter.fixEncryptionTags");\r
242                 logger.log(logger.EXTREME, content);\r
243                 \r
244                 // Fix the problem that the img tag isn't properly closed\r
245                 int endPos, startPos, endData,slotStart, slotEnd;\r
246                 logger.log(logger.MEDIUM, "Checking table encryption tags");\r
247                 String eTag = "<table class=\"en-crypt-temp\"";\r
248                 for (int i=newContent.indexOf(eTag); i>0; i = newContent.indexOf(eTag,i+1)) {\r
249                         slotStart = newContent.indexOf("slot", i+1)+6;\r
250                         slotEnd = newContent.indexOf("\"",slotStart);\r
251                         String slot = newContent.substring(slotStart, slotEnd);\r
252                         startPos = newContent.indexOf("<td>", i+1)+4;\r
253                         endData = newContent.indexOf("</td>",startPos);\r
254                         String text = newContent.substring(startPos,endData);\r
255                         endPos = newContent.indexOf("</table>",i+1)+8;\r
256                         // Encrypt the text\r
257                         Pair<String,String> pair = Global.passwordSafe.get(slot);\r
258                         String password = pair.getFirst();\r
259                         String hint = pair.getSecond();\r
260                         EnCrypt crypt = new EnCrypt(); \r
261                         String encrypted = crypt.encrypt(text, password, 64); \r
262 \r
263                         // replace the table with an en-crypt tag.\r
264                         newContent = newContent.substring(0,i-1) + \r
265                                 "<en-crypt-temp cipher=\"RC2\" length=\"64\" hint=\""+\r
266                                 hint +"\" value=\""+\r
267                                 encrypted +\r
268                                 "\" />" +\r
269                                 newContent.substring(endPos);\r
270                 }\r
271                 \r
272                 return newContent;\r
273         }\r
274         \r
275         // Fix XML problems that Qt can't deal with\r
276         public String fixStupidXMLProblems(String content) {\r
277                 logger.log(logger.HIGH, "Entering DBRunner.fixStupidXMLProblems");\r
278 \r
279                 // Fix the problem that the document body isn't properly closed\r
280                 String newContent = new String(content);\r
281                 logger.log(logger.MEDIUM, "Inside fixStupidXMLProblems.  Old content:");\r
282                 logger.log(logger.EXTREME, content);\r
283                 \r
284                 // Fix the problem that the img tag isn't properly closed\r
285                 int endPos;\r
286                 logger.log(logger.MEDIUM, "Checking img tags");\r
287                 for (int i=newContent.indexOf("<img"); i>0; i = newContent.indexOf("<img",i+1)) {\r
288                         endPos = newContent.indexOf(">",i+1);\r
289                         String end = newContent.substring(endPos+1);\r
290                         newContent = newContent.subSequence(0,endPos) +"/>"+end;\r
291                 }\r
292                 \r
293                 // Fix the problem that the input tag isn't properly closed\r
294                 logger.log(logger.MEDIUM, "Checking input tags");\r
295                 for (int i=newContent.indexOf("<input"); i>0; i = newContent.indexOf("<input",i+1)) {\r
296                         endPos = newContent.indexOf(">",i+1);\r
297                         String end = newContent.substring(endPos+1);\r
298                         newContent = newContent.subSequence(0,endPos) +"/>"+end;\r
299                 }\r
300                 \r
301                 \r
302                 // Fix the problem that the <br> tag isn't properly closed\r
303                 logger.log(logger.MEDIUM, "Checking br tags");\r
304                 for (int i=newContent.indexOf("<br"); i>0; i = newContent.indexOf("<br",i+1)) {\r
305                         endPos = newContent.indexOf(">",i+1);\r
306                         String end = newContent.substring(endPos+1);\r
307                         newContent = newContent.subSequence(0,endPos) +"/>"+end;\r
308                 }\r
309                         \r
310                 // Fix the problem that the <hr> tag isn't properly closed\r
311                 logger.log(logger.MEDIUM, "Checking hr tags");\r
312                 for (int i=newContent.indexOf("<hr"); i>0; i = newContent.indexOf("<hr",i+1)) {\r
313                         endPos = newContent.indexOf(">",i+1);\r
314                         String end = newContent.substring(endPos+1);\r
315                         newContent = newContent.subSequence(0,endPos) +"/>"+end;\r
316                 }\r
317                 \r
318                 logger.log(logger.MEDIUM, "Leaving fixStupidXMLProblems");\r
319                 logger.log(logger.HIGH, "Leaving DBRunner.fixStupidXMLProblems");\r
320                 return newContent.toString();\r
321         }\r
322 \r
323 \r
324         // Fix XML that Evernote thinks is invalid\r
325         public String fixEnXMLCrap(String note) {\r
326                 logger.log(logger.EXTREME, "Entering EnmlConverter.fixEnXMLCrap");\r
327                 if (note == null)\r
328                         return null;\r
329                 \r
330                 int pos;\r
331                 StringBuffer buffer = new StringBuffer(note);\r
332                 \r
333                 logger.log(logger.EXTREME, "Converting <b/>");\r
334                 // change all <b/> to <b></b> because Evernote hates them if they happen in <span>\r
335                 pos = buffer.indexOf("<b/>");\r
336                 for (; pos>-1; ) {\r
337                         buffer.replace(pos, pos+4, "<b></b>");\r
338                         pos = buffer.indexOf("<b/>",pos);\r
339                 }\r
340                 // change all <br/> to <br></br> because Evernote hates them if they happen in <span>\r
341                 logger.log(logger.EXTREME, "converting <br/>");\r
342                 pos = buffer.indexOf("<br/>");\r
343                 for (; pos>-1; ) {\r
344                         buffer.replace(pos, pos+5, "<br></br>");\r
345                         pos = buffer.indexOf("<br/>",pos);\r
346                 }\r
347                 \r
348                 // change all <span> elements in lists because Evernote hates them if they happen \r
349                 int endPos = 0;\r
350                 int spanPos;\r
351                 pos = buffer.indexOf("<li>");\r
352                 spanPos = buffer.indexOf("<span>");\r
353                 // Get rid of empty spans in <li> elements\r
354                 pos = buffer.indexOf("<li>");\r
355                 spanPos = buffer.indexOf("<span/>");\r
356                 for (; pos>-1 && spanPos >-1;) {\r
357                         endPos = buffer.indexOf("</li>",pos);\r
358                         if (spanPos > pos && spanPos < endPos) {\r
359                                 buffer.replace(spanPos,spanPos+7,"");\r
360                         }\r
361                         pos=buffer.indexOf("<li>",pos+1);\r
362                         spanPos = buffer.indexOf("<span/>",spanPos);\r
363                 }\r
364                 \r
365                 logger.log(logger.EXTREME, "Leaving EnmlConverter.fixEnXMLCrap");\r
366                 return buffer.toString();\r
367         }\r
368         \r
369         // Fix stupid en-media problems\r
370         public String fixEnMediaCrap(String note) {\r
371                 if (note == null)\r
372                         return null;\r
373                 \r
374                 StringBuffer buffer = new StringBuffer(note);\r
375                 // get rid of any </en-media> tags since they shouldn't exist.\r
376                 int pos = buffer.indexOf("</en-media>");\r
377                 for (; pos>-1; ) {\r
378                         buffer.replace(pos, pos+11, "");\r
379                         pos = buffer.indexOf("</en-media>",pos);\r
380                 }\r
381                 \r
382                 \r
383                 // Make sure we have a proper /> ending the en-media tag\r
384                 pos = buffer.indexOf("<en-media");\r
385                 for (; pos>-1; ) {\r
386                         pos=buffer.indexOf(">", pos);\r
387                         if (!buffer.substring(pos-1,pos).equals("/"))\r
388                         buffer.replace(pos, pos+1, " />");\r
389                         pos = buffer.indexOf("<en-media",pos);\r
390                 }\r
391                 \r
392                 return buffer.toString();\r
393         }\r
394 }\r