OSDN Git Service

create html2wiki
[otptools/otptools.git] / html2wiki / fetch_sfjpmag.py
diff --git a/html2wiki/fetch_sfjpmag.py b/html2wiki/fetch_sfjpmag.py
new file mode 100755 (executable)
index 0000000..bf4fa43
--- /dev/null
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""fetch url"""
+
+import os.path
+import sys
+import re
+import os
+import urlparse
+
+import spyder
+
+
class SfjpMagFetcher(spyder.Spyder):
    """Web spider for SourceForge.JP Magazine.

    Starting from a story URL, crawls the story's numbered pages and
    mirrors each page's HTML (as ``body.html``) plus its image
    attachments into per-page directories under ``output_dir``.
    """

    def __init__(self, url, output_dir):
        """url needs complete url like 'http://hogehoge.jp/foo/bar/boo'."""
        spyder.Spyder.__init__(self)
        self._url = url
        self._output_dir = output_dir
        # Matches the story URL itself plus an optional "/N" page suffix.
        self._url_rex = re.compile("^" + url + r"([/0-9]*)$")
        # Image attachments are served from the static blob host.
        # NOTE(review): the dots are unescaped, so this matches slightly
        # more than intended (e.g. "staticXsourceforge.jp"); harmless here.
        self._url_rex_img = re.compile(r"^http://static.sourceforge.jp/magazine/blob/.*$")
        self.append_url(url)

    def handle_url(self, url):
        """Return True if url is a page of this story and should be traced.

        Attachment (image) URLs are downloaded immediately as a side
        effect but are never traced further.
        """
        if self._url_rex.search(url):
            return True
        if self._url_rex_img.search(url):
            self._save_attachment(url)
        return False

    def _save_attachment(self, url):
        """Download url and store it in the current page's output directory."""
        t = urlparse.urlparse(url)  # scheme://netloc/path;parameters?query#fragment
        filename = t.path.split("/")[-1]
        data = self.grab_by_get(url)
        output_dir = self._url_to_path(self.current_url())
        output_file = os.path.join(output_dir, filename)

        self.prepare_output_dir(output_dir)

        print >> sys.stderr, "output to %s ." % (output_file)
        # BUGFIX: attachments are binary (images) — open in binary mode so
        # the bytes are not mangled on platforms that translate newlines.
        f = open(output_file, "wb")
        try:
            f.write(data)
        finally:
            f.close()

    def handle_start_fetch(self, url):
        """this function is called when start to fetch url."""
        print >> sys.stderr, "fetch %s ..." % (url)

    def _url_to_path(self, url):
        """Map a story page url to its output directory path.

        Returns None when url does not belong to this story.
        """
        m = self._url_rex.search(url)
        if not m:
            # something wrong!
            return None
        page = m.group(1)  # "" for the first page, "/N" for page N
        if page == "":
            pagenum = 1
        elif page[0] == "/":
            pagenum = page[1:]
        else:
            # BUGFIX: a bare digit suffix (the regex allows e.g. "12"
            # with no slash) previously left pagenum unbound and raised
            # NameError; treat it as the page number.
            pagenum = page
        story_id = url.replace("http://sourceforge.jp/magazine/", "")

        # The first page's URL has no "/1" suffix, so append one to give
        # every page its own directory; explicit pages already carry
        # "/N" inside story_id.  (pagenum is a string for explicit pages,
        # so "== 1" is only true for the suffix-less first page.)
        if pagenum == 1:
            rel_dir = story_id + "/1"
        else:
            rel_dir = story_id
        return os.path.join(self._output_dir, rel_dir)

    def handle_data(self, url, level, data):
        """this function is called when data grabbed."""
        output_dir = self._url_to_path(url)
        output_file = os.path.join(output_dir, "body.html")
        self.prepare_output_dir(output_dir)

        print >> sys.stderr, "output to %s ." % (output_file)
        f = open(output_file, "w")
        try:
            f.write(data)
        finally:
            f.close()

        # get attachments referenced by <img> tags in the page
        for src in self.extract_imgs(data, url):
            if self._url_rex_img.search(src):
                self._save_attachment(src)

    def prepare_output_dir(self, dir):
        """Create dir (and any missing parents) if it does not exist."""
        if not os.path.exists(dir):
            try:
                os.makedirs(dir)
            except OSError:
                # BUGFIX: the original caught the undefined name `error`,
                # which itself raised NameError on any makedirs failure.
                # Ignore the race where the directory appeared meanwhile.
                pass
+
+
+
def fetch(url, output_dir):
    """Fetch SourceForge.JP Magazine's story selected by url with keep paging"""
    fetcher = SfjpMagFetcher(url, output_dir)
    fetcher.run()
+