--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""fetch url"""
+
+import os.path
+import sys
+import re
+import os
+import urlparse
+
+import spyder
+
+
+class Fetcher(spyder.Spyder):
+ """Web Spider"""
+ def __init__(self, start_url, filter, output_dir):
+ """url needs complete url like 'http://hogehoge.jp/foo/bar/boo'."""
+ spyder.Spyder.__init__(self)
+ self._url = start_url
+ self._output_dir = output_dir
+ self._url_rex = re.compile(filter)
+ self._url_rex_img = re.compile(r".*(\.png|\.jpg|\.gif|\.txt)$")
+ self.append_url(start_url)
+
+ def _url_to_pathname(self, url):
+ m = self._url_rex.search(url)
+ if m:
+ s = m.group(1)
+ if len(s) == 0:
+ return ("", "index.html")
+ elif s[-1] == "/":
+ return (s[0:-1], "index.html")
+ else:
+ spl = s.rsplit("/", 1)
+ if len(spl) > 1:
+ return (spl[0], spl[1])
+ else:
+ return ("", spl[0])
+ else:
+ return (None, None)
+
+ def _save_attachment(self, url):
+ """save url as attachment."""
+ print >> sys.stderr, "save %s ..." % url
+ data = self.grab_by_get(url)
+ (dirname, filename) = self._url_to_pathname(url)
+ output_dir = os.path.join(self._output_dir, dirname)
+ output_file = os.path.join(output_dir, filename)
+ self.prepare_output_dir(output_dir)
+
+ print >> sys.stderr, "output to %s ." % (output_file)
+ f = open(output_file, "w")
+ f.write(data)
+ f.close()
+
+ def handle_url(self, url):
+ """check url should be traced or not. if trace, return True. Normally, you should override this function."""
+
+ if self._url_rex.search(url):
+ if self._url_rex_img.search(url):
+ self._save_attachment(url)
+ else:
+ return True
+ return False
+
+ def handle_start_fetch(self, url):
+ """this function is called when start to fetch url."""
+ print >> sys.stderr, "fetch %s ..." % (url)
+
+ def handle_data(self, url, level, data):
+ """this function is called when data grabbed."""
+ data = self.grab_by_get(url)
+ (dirname, filename) = self._url_to_pathname(url)
+ output_dir = os.path.join(self._output_dir, dirname)
+ output_file = os.path.join(output_dir, filename)
+ self.prepare_output_dir(output_dir)
+
+ print >> sys.stderr, "output to %s ." % (output_file)
+ try:
+ f = open(output_file, "w")
+ except IOError:
+ print >> sys.stderr, "error: cannot open file: %s . skip..."
+ return
+ f.write(data)
+ f.close()
+
+ # get attachments
+ for src in self.extract_imgs(data, url):
+ if self._url_rex_img.search(src) and self._url_rex.search(src):
+ self._save_attachment(src)
+
+ def prepare_output_dir(self, dir):
+ if not os.path.exists(dir):
+ try:
+ os.makedirs(dir)
+ except Exception:
+ print >> sys.stderr, "exception: cannot make directory %s ." % dir
+ raise Exception