OSDN Git Service

Add spyder.py and test_spyder.py
authorhylom <hylom@users.sourceforge.jp>
Fri, 16 Oct 2009 10:27:04 +0000 (19:27 +0900)
committerhylom <hylom@users.sourceforge.jp>
Fri, 16 Oct 2009 10:27:04 +0000 (19:27 +0900)
spyder.py [new file with mode: 0755]
test_spyder.py [new file with mode: 0755]

diff --git a/spyder.py b/spyder.py
new file mode 100755 (executable)
index 0000000..79b2f3b
--- /dev/null
+++ b/spyder.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-\r
+"""spyder.py -- tiny and customizable WWW spider.\r
+\r
+This module helps to fetch WWW contents. \r
+\r
+This module provides below classes and functions.\r
+\r
+Spyder -- Spyder base class.\r
+SpyderError -- Error class for exception process.\r
+"""\r
+\r
+__all__ = ["spyder"]\r
+__version__ = "0.10"\r
+\r
+import urllib\r
+import HTMLParser\r
+import os.path\r
+from urlparse import urlparse\r
+\r
+\r
class SpyderError(Exception):
    """Exception raised for Spyder-specific failures."""

    def __init__(self, name, value):
        """Remember the error's symbolic name and the offending value."""
        self.name = name
        self.value = value

    def __str__(self):
        """Render as "<name>:<repr of value>"."""
        return "%s:%s" % (self.name, repr(self.value))
+\r
+\r
class AnchorParser(HTMLParser.HTMLParser):
    """Parse HTML and extract the target URLs of <a href="..."> tags.

usage:
    url = "http://hogehoge.net/foo/bar.html"
    p = AnchorParser()
    anchors = p.extract_anchors(html_string, url)
"""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Absolute URLs collected so far, in document order.
        self._anchors = []

    def extract_anchors(self, html_string, url):
        """Parse html_string, resolving links against url, and return anchors.

        Bug fix: the result list is reset first, so one parser instance
        can be reused for several documents without URLs from earlier
        calls leaking into the result.
        """
        self._anchors = []
        self._base_url_items = urlparse(url)
        self.feed(html_string)
        return self.anchors()

    def anchors(self):
        """Return the URLs collected by the last extract_anchors()/feed()."""
        return self._anchors

    def handle_starttag(self, tag, attrs):
        """Start-tag handler: record the first href of every <a> tag."""
        if tag == "a":
            for (attr, val) in attrs:
                if attr == "href":
                    self._anchors.append(self._regularize_url(val))
                    break

    def _regularize_url(self, url):
        """Turn url into an absolute URL, resolved against the base URL."""
        # urlparse("http://hoge.net/foo/var/index.html;q?a=b#c")
        #
        #       0       1           2                      3    4      5
        #  -> ('http', 'hoge.net', '/foo/var/index.html', 'q', 'a=b', 'c')
        #
        current_term = self._base_url_items
        current_dir = os.path.dirname(current_term[2])

        term = list(urlparse(url))

        # Inherit scheme and host from the base URL when absent.
        if not term[0]:
            term[0] = current_term[0] + "://"
        else:
            term[0] = term[0] + "://"
        if not term[1]:
            term[1] = current_term[1]
        # A relative path is resolved against the base document's directory.
        if term[2] and term[2][0] != "/":
            term[2] = os.path.normpath(current_dir + "/" + term[2])
        # Re-attach the separators urlparse stripped from params/query/fragment.
        if term[3]:
            term[3] = ";" + term[3]
        if term[4]:
            term[4] = "?" + term[4]
        if term[5]:
            term[5] = "#" + term[5]

        return "".join(term)
+\r
+\r
class _DownloadQueue(object):
    """FIFO queue of URLs that never queues the same URL twice.

    `map` holds one entry per URL ever queued (1 = waiting in the queue,
    0 = already popped), so re-appending a seen URL is a no-op.
    """

    def __init__(self):
        """Build an empty queue."""
        self.init()

    def init(self):
        """Clear the queue and the seen-URL map."""
        self.queue = []
        self.map = {}

    def append(self, url):
        """Append url to the queue unless it has been queued before."""
        # `in` replaces the deprecated dict.has_key().
        if url not in self.map:
            self.map[url] = 1
            self.queue.append(url)

    def pop(self):
        """Remove and return the oldest queued URL.

        Bug fix: the original assigned `self.map[url] = 0` before `url`
        existed, raising NameError on every call.  Raises IndexError
        when the queue is empty.
        """
        url = self.queue.pop(0)
        self.map[url] = 0
        return url
+\r
+\r
class Spyder(object):
    """WWW Spider base class.

    Subclasses override handle_url(), handle_start_fetch() and
    handle_data() to choose which links to follow and to process the
    fetched pages.
    """
    version = __version__

    def __init__(self):
        """Create a spider with an empty download queue."""
        self.queue = _DownloadQueue()

    def append_url(self, url):
        """Append url to the fetch queue."""
        self.queue.append(url)

    def handle_url(self, url):
        """Return True if url should be traced.  Override this; the
        default follows nothing."""
        return False

    def handle_start_fetch(self, url):
        """Called just before url is fetched.  Override in subclasses."""
        pass

    def handle_data(self, url, level, data):
        """Called with the data grabbed from url.  Override in subclasses."""
        pass

    def run(self):
        """Fetch queued URLs until the queue is exhausted.

        Bug fixes: handle_start_fetch() and handle_data() were called
        with the wrong arguments (TypeError on every iteration), and
        queue exhaustion now ends the loop instead of escaping as an
        uncaught exception.
        """
        while True:
            try:
                url = self.queue.pop()
            except IndexError:
                break  # nothing left to fetch

            self.handle_start_fetch(url)
            html = self.grab_by_get(url)
            # The queue does not track recursion depth; pass level 0.
            self.handle_data(url, 0, html)

            # Queue every link the subclass asks to follow.
            for anchor in self.extract_anchors(html, url):
                if self.handle_url(anchor):
                    self.queue.append(anchor)

    def grab_by_get(self, url):
        """Return the content of url fetched with an HTTP GET."""
        u = urllib.urlopen(url)
        try:
            data = u.read()
        finally:
            u.close()  # don't leak the connection
        return data

    def grab_by_post(self, url, params):
        """Return the content of url fetched with an HTTP POST of params."""
        encoded_params = urllib.urlencode(params)
        u = urllib.urlopen(url, encoded_params)
        try:
            data = u.read()
        finally:
            u.close()  # don't leak the connection
        return data

    def extract_anchors(self, html, url):
        """Extract and return the link URLs found in html."""
        p = AnchorParser()
        return p.extract_anchors(html, url)
+\r
diff --git a/test_spyder.py b/test_spyder.py
new file mode 100755 (executable)
index 0000000..a5d6ec3
--- /dev/null
@@ -0,0 +1,75 @@
+#!/usr/bin/env python\r
+# -*- coding: utf-8 -*-\r
+"""Test suite for spyder.py."""\r
+\r
+import unittest\r
+import os.path\r
+import codecs\r
+\r
+import spyder\r
+\r
+\r
class TestSequenceFunctions(unittest.TestCase):
    """Unit tests for spyder.AnchorParser."""
    def setUp(self):
        # Fixture: an XHTML page whose footer contains eight <a href> anchors
        # (absolute, site-relative and document-relative forms).
        self.test_html = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"
 lang="en" dir="ltr">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <title>Spyder test HTML</title>
  </head>
<body class='sidebar_inside_left'>
<form action="" method="post" >
<input type="hidden" name="sectok" value="d1edf02df0138c3a7307e24a371115e9" />
<input type="hidden" name="id" value="home" />
<input type="hidden" name="rev" value="" />
<input type="hidden" name="date" value="1239099312" />
<input type="hidden" name="prefix" value="" />
<input type="hidden" name="suffix" value="" />
<input type="hidden" name="changecheck" value="288b43989cd8dfa23319573916dfd16d" />
<textarea name="wikitext" id="wiki__text" class="edit" cols="80" rows="10" tabindex="1" >
This is text codes.
foo bar hoge hoge
</textarea>
</form>
<div class="footerinc">
  <a  href="/wiki/feed.php" title="Recent changes RSS feed"><img src="/wiki/lib/tpl/sidebar/images/button-rss.png" width="80" height="15" alt="Recent changes RSS feed" /></a>
        <a  href="http://creativecommons.org/licenses/by-nc-sa/3.0/" rel="license" title="CC Attribution-Noncommercial-Share Alike 3.0 Unported"><img src="/wiki/lib/images/license/button/cc-by-nc-sa.png" width="80" height="15" alt="" /></a>
  <a  href="http://www.dokuwiki.org/donate" title="Donate"><img src="/wiki/lib/tpl/sidebar/images/button-donate.gif" alt="Donate" width="80" height="15" /></a>
  <a  href="http://www.php.net" title="Powered by PHP"><img src="/wiki/lib/tpl/sidebar/images/button-php.gif" width="80" height="15" alt="Powered by PHP" /></a>
  <a  href="http://validator.w3.org/check/referer" title="Valid XHTML 1.0"><img src="/wiki/lib/tpl/sidebar/images/button-xhtml.png" width="80" height="15" alt="Valid XHTML 1.0" /></a>
  <a  href="http://jigsaw.w3.org/css-validator/check/referer?profile=css3" title="Valid CSS"><img src="/wiki/lib/tpl/sidebar/images/button-css.png" width="80" height="15" alt="Valid CSS" /></a>
  <a  href="http://dokuwiki.org/" title="Driven by DokuWiki"><img src="/wiki/lib/tpl/sidebar/images/button-dw.png" width="80" height="15" alt="Driven by DokuWiki" /></a>
  <a  href="hogehoge" title="hogehoge"><img src="/wiki/lib/tpl/sidebar/images/button-dw.png" width="80" height="15" alt="Driven by DokuWiki" /></a>
</div>
</body>
</html>
"""

    def test_extract_anchors(self):
        """AnchorParser resolves all eight hrefs to absolute URLs."""

        # Expected results: relative hrefs ("/wiki/feed.php", "hogehoge")
        # must be resolved against the base URL passed to extract_anchors.
        ret_ok = ["http://ffdshow-tryout.sourceforge.net/wiki/feed.php",
                  "http://creativecommons.org/licenses/by-nc-sa/3.0/",
                  "http://www.dokuwiki.org/donate",
                  "http://www.php.net",
                  "http://validator.w3.org/check/referer",
                  "http://jigsaw.w3.org/css-validator/check/referer?profile=css3",
                  "http://dokuwiki.org/",
                  "http://ffdshow-tryout.sourceforge.net/wiki/hogehoge"]
        
        p = spyder.AnchorParser()
        ret = p.extract_anchors(self.test_html, "http://ffdshow-tryout.sourceforge.net/wiki/home")
        self.assertEqual(len(ret), len(ret_ok))
        # Order-insensitive comparison: sort both sides, then compare pairwise.
        ret.sort()
        ret_ok.sort()
        for index in range(len(ret)):
            self.assertEqual(ret[index], ret_ok[index])
+\r
+\r
+\r
# Run the test suite only when executed as a script, so importing this
# module (e.g. from a test collector) does not trigger the run.
if __name__ == "__main__":
    suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceFunctions)
    unittest.TextTestRunner(verbosity=2).run(suite)
+\r