OSDN Git Service

create html2wiki, fix problem in spyder.py in url's fragments process
author hylom <hylom@users.sourceforge.jp>
Fri, 8 Jan 2010 10:18:29 +0000 (19:18 +0900)
committer hylom <hylom@users.sourceforge.jp>
Fri, 8 Jan 2010 10:18:29 +0000 (19:18 +0900)
html2wiki/fetch_html.py [new file with mode: 0755]
html2wiki/fetcher.py [new file with mode: 0644]

diff --git a/html2wiki/fetch_html.py b/html2wiki/fetch_html.py
new file mode 100755 (executable)
index 0000000..6aeca65
--- /dev/null
+++ b/html2wiki/fetch_html.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Command-line entry point: crawl from a start URL and save the result."""
+
+import fetcher
+import sys
+import os.path
+
+
+def main(argv):
+    """Run the fetcher with the arguments in argv.
+
+    argv is laid out like sys.argv: the program name followed by the
+    start URL, the URL filter and the output directory. All three are
+    handed to fetcher.Fetcher unchanged.
+    Exits through sys.exit with a message when an argument is missing
+    or the output directory is not an existing directory.
+    """
+    usage = "%s <start_url> <url_filter> <output_dir>" % argv[0]
+
+    try:
+        start = argv[1]
+        # Named url_filter so the builtin filter() is not shadowed.
+        url_filter = argv[2]
+        output_dir = argv[3]
+    except IndexError:
+        sys.exit(usage)
+
+    if not os.path.isdir(output_dir):
+        # Say what is wrong instead of printing only the usage line.
+        sys.exit("error: %s is not a directory.\n%s" % (output_dir, usage))
+
+    f = fetcher.Fetcher(start, url_filter, output_dir)
+    f.run()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/html2wiki/fetcher.py b/html2wiki/fetcher.py
new file mode 100644 (file)
index 0000000..29fc03c
--- /dev/null
+++ b/html2wiki/fetcher.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""fetch url"""
+
+import os.path
+import sys
+import re
+import os
+import urlparse
+
+import spyder
+
+
+class Fetcher(spyder.Spyder):
+    """Spider that saves the pages and attachments matching a URL filter.
+
+    Matching URLs are written below the output directory; the relative
+    path of each file is taken from group 1 of the filter match.
+    """
+
+    def __init__(self, start_url, filter, output_dir):
+        """Set up the crawl and queue start_url.
+
+        start_url  -- complete URL like 'http://hogehoge.jp/foo/bar/boo'.
+        filter     -- regular expression source; it needs at least one
+                      group, whose matched text becomes the output path.
+        output_dir -- directory the fetched files are written under.
+        """
+        spyder.Spyder.__init__(self)
+        self._url = start_url
+        self._output_dir = output_dir
+        self._url_rex = re.compile(filter)
+        # URLs ending in these extensions are saved as attachments.
+        self._url_rex_img = re.compile(r".*(\.png|\.jpg|\.gif|\.txt)$")
+        self.append_url(start_url)
+
+    def _url_to_pathname(self, url):
+        """Map url to a (dirname, filename) pair relative to the output dir.
+
+        Group 1 of the filter match is split at its last "/". An empty
+        path or one ending in "/" maps to "index.html" in that directory.
+        Returns (None, None) when the filter does not match or group 1
+        took no part in the match.
+        """
+        m = self._url_rex.search(url)
+        if not m:
+            return (None, None)
+        s = m.group(1)
+        if s is None:
+            # An optional group that matched nothing: no path to build.
+            return (None, None)
+        if len(s) == 0:
+            return ("", "index.html")
+        if s[-1] == "/":
+            return (s[0:-1], "index.html")
+        spl = s.rsplit("/", 1)
+        if len(spl) > 1:
+            return (spl[0], spl[1])
+        return ("", spl[0])
+
+    def _write_output(self, url, data, mode):
+        """Write data to the file url maps to; return True on success.
+
+        The problem is reported on stderr and False is returned when url
+        cannot be mapped to a path, the path would leave the output
+        directory, or the file cannot be opened with the given mode.
+        """
+        (dirname, filename) = self._url_to_pathname(url)
+        if dirname is None:
+            sys.stderr.write(
+                "error: cannot map %s to a file. skip...\n" % url)
+            return False
+        output_dir = os.path.join(self._output_dir, dirname)
+        output_file = os.path.join(output_dir, filename)
+
+        # The path is built from a crawled URL: reject absolute parts and
+        # ".." segments that would put the file outside the output dir.
+        root = os.path.abspath(self._output_dir).rstrip(os.sep) + os.sep
+        if not os.path.abspath(output_file).startswith(root):
+            sys.stderr.write("error: %s is outside %s . skip...\n"
+                             % (output_file, root))
+            return False
+
+        self.prepare_output_dir(output_dir)
+
+        sys.stderr.write("output to %s .\n" % (output_file))
+        try:
+            f = open(output_file, mode)
+        except IOError:
+            sys.stderr.write("error: cannot open file: %s . skip...\n"
+                             % output_file)
+            return False
+        try:
+            f.write(data)
+        finally:
+            # Close the handle even when the write fails.
+            f.close()
+        return True
+
+    def _save_attachment(self, url):
+        """Download url and save it unchanged as an attachment."""
+        sys.stderr.write("save %s ...\n" % url)
+        data = self.grab_by_get(url)
+        # Binary mode: attachments include images, which text mode
+        # corrupts on platforms that translate line endings.
+        self._write_output(url, data, "wb")
+
+    def handle_url(self, url):
+        """Return True when url should be traced (crawled).
+
+        A URL matching both the filter and the attachment pattern is
+        saved right away and not traced; a URL outside the filter is
+        ignored. Both cases return False.
+        """
+        if self._url_rex.search(url):
+            if self._url_rex_img.search(url):
+                self._save_attachment(url)
+            else:
+                return True
+        return False
+
+    def handle_start_fetch(self, url):
+        """Called when fetching of url starts; reports it on stderr."""
+        sys.stderr.write("fetch %s ...\n" % (url))
+
+    def handle_data(self, url, level, data):
+        """Called when data was grabbed; save the page and its attachments.
+
+        level is not used by this implementation.
+        """
+        # NOTE(review): the data argument is discarded and the page is
+        # downloaded again -- confirm whether the passed-in data could
+        # be written instead.
+        data = self.grab_by_get(url)
+        if not self._write_output(url, data, "w"):
+            return
+
+        # get attachments
+        for src in self.extract_imgs(data, url):
+            if self._url_rex_img.search(src) and self._url_rex.search(src):
+                self._save_attachment(src)
+
+    def prepare_output_dir(self, dir):
+        """Create directory dir, with missing parents, unless it exists.
+
+        A failure is reported on stderr and the original exception is
+        re-raised.
+        """
+        if not os.path.exists(dir):
+            try:
+                os.makedirs(dir)
+            except Exception:
+                sys.stderr.write(
+                    "exception: cannot make directory %s .\n" % dir)
+                # Bare raise keeps the original exception type and message.
+                raise