--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""fetch url"""
+
+import os.path
+import sys
+import re
+import os
+import urlparse
+
+import spyder
+
+
class SfjpMagFetcher(spyder.Spyder):
    """Web spider that mirrors a SourceForge.JP Magazine story to disk.

    Starting from a story URL, it follows pagination links matching that
    URL and saves each page's HTML body plus any attached images below
    the configured output directory.
    """

    def __init__(self, url, output_dir):
        """url needs complete url like 'http://hogehoge.jp/foo/bar/boo'.

        output_dir is the root directory the story tree is written under.
        """
        spyder.Spyder.__init__(self)
        self._url = url
        self._output_dir = output_dir
        # re.escape() so dots and other metacharacters in the story URL
        # are matched literally instead of acting as regex wildcards.
        # Group 1 captures the pagination suffix ("" or "/N").
        self._url_rex = re.compile("^" + re.escape(url) + r"([/0-9]*)$")
        self._url_rex_img = re.compile(
            r"^http://static\.sourceforge\.jp/magazine/blob/.*$")
        self.append_url(url)

    def handle_url(self, url):
        """Decide whether url should be traced; return True to trace.

        Story pages (matching the story URL plus an optional page suffix)
        are traced; image attachments are saved immediately and not traced.
        Everything else is ignored.
        """
        if self._url_rex.search(url):
            return True
        if self._url_rex_img.search(url):
            self._save_attachment(url)
        return False

    def _save_attachment(self, url):
        """Save url (an image attachment) into the current page's directory."""
        # scheme://netloc/path;parameters?query#fragment -- keep the last
        # path component as the local file name.
        parts = urlparse.urlparse(url)
        filename = parts.path.split("/")[-1]
        data = self.grab_by_get(url)
        output_dir = self._url_to_path(self.current_url())
        output_file = os.path.join(output_dir, filename)

        self.prepare_output_dir(output_dir)

        print >> sys.stderr, "output to %s ." % (output_file)
        # "wb": attachments are binary (images); text mode would corrupt
        # them on platforms that translate newlines.
        f = open(output_file, "wb")
        try:
            f.write(data)
        finally:
            f.close()

    def handle_start_fetch(self, url):
        """Called when fetching of url starts; log progress to stderr."""
        print >> sys.stderr, "fetch %s ..." % (url)

    def _url_to_path(self, url):
        """Map a story-page url to its local output directory.

        Returns None when url does not belong to the story being fetched.
        """
        m = self._url_rex.search(url)
        if not m:
            # url is outside the story -- something wrong!
            return None
        page = m.group(1)  # "" for the first page, "/N" for page N
        if page == "":
            pagenum = 1
        elif page.startswith("/"):
            pagenum = page[1:]
        else:
            # Bare digits with no "/" separator (regex allows it); the
            # original left pagenum unbound here, raising UnboundLocalError.
            pagenum = page
        story_id = url.replace("http://sourceforge.jp/magazine/", "")

        if pagenum == 1:
            # First page has no "/1" suffix in the URL; add it so the
            # directory layout is uniform across pages.
            rel_dir = story_id + "/1"
        else:
            rel_dir = story_id
        return os.path.join(self._output_dir, rel_dir)

    def handle_data(self, url, level, data):
        """Called when page data has been grabbed; write it and its images."""
        output_dir = self._url_to_path(url)
        output_file = os.path.join(output_dir, "body.html")
        self.prepare_output_dir(output_dir)

        print >> sys.stderr, "output to %s ." % (output_file)
        f = open(output_file, "w")
        try:
            f.write(data)
        finally:
            f.close()

        # get attachments referenced by <img> tags in the page
        for src in self.extract_imgs(data, url):
            if self._url_rex_img.search(src):
                self._save_attachment(src)

    def prepare_output_dir(self, dir):
        """Create dir (and parents) if it does not exist yet; best-effort."""
        if not os.path.exists(dir):
            try:
                os.makedirs(dir)
            except os.error:
                # Original caught the undefined name `error`, which would
                # itself raise NameError. os.error (== OSError) is the
                # intended target; ignore races where the directory was
                # created concurrently.
                pass
+
+
+
def fetch(url, output_dir):
    """Fetch SourceForge.JP Magazine's story selected by url with keep paging"""
    SfjpMagFetcher(url, output_dir).run()
+