# --------------------------------------------------------------
# Simple web spider script sample. Starting from a configured
# URL, each task fetches one page and collects the links on it.
# --------------------------------------------------------------

import re
import urllib2

from HTMLParser import HTMLParser
from constrictor.task import Task
from constrictor.script import Script, ScriptManager
from constrictor.properties import Properties
from constrictor.log import *
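
# NOTE: HTMLParser and urllib2 are Python 2 standard-library modules, so
# this script must run under a Python 2 interpreter.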

class PageFetchTask(Task):

    def __init__(self, spider, name=None):
        Task.__init__(self, name)
        self.spider = spider

    def run(self):
        # run() is assumed to be the Task method that start() invokes.
        return self.spider.fetch_url()
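
# PageFetchTask wraps each page fetch so that constrictor can (presumably)
# time and record every fetch as its own unit of work.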

class WebSpiderScript(Script):

    # Heavily modified version of the script found at
    # http://www.halotis.com/2009/09/16/python-web-crawler-script/

    class Spider(HTMLParser):

        def __init__(self, url, max_visits, limit_paths, allowed_hosts=None):
            HTMLParser.__init__(self)
            self.url = url
            self.db = {self.url: 1}
            self.url_list = [self.url]
            self.max_visits = max_visits
            self.allowed_hosts = allowed_hosts or []
            proto, self.host, path = self.url_parts(url)
            self.limit_paths = limit_paths
            # The starting host is always allowed.
            if self.host not in self.allowed_hosts:
                self.allowed_hosts.append(self.host)

        def url_parts(self, url):
            # Split a URL into its (protocol, host, path) components.
            res = re.search(r'^(https?)://([^/]+)(.*)', url)
            if not res:
                raise Exception("Invalid URL: %s" % url)
            proto, host, path = res.group(1), res.group(2), res.group(3)
            return proto, host, path or '/'  # normalize an empty path to '/'
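
        # e.g. url_parts('http://example.com/a/b?x=1') returns
        # ('http', 'example.com', '/a/b?x=1')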

        def handle_starttag(self, tag, attrs):
            if tag != 'a' or not attrs:
                return
            link = [h for h in attrs if h[0] == 'href']
            if len(link) == 0:
                return
            link = link[0][1]  # attrs holds (name, value) pairs
            if link[:4] != "http":
                # Resolve a relative link against the current URL.
                proto, host, path = self.url_parts(self.url)
                # Ignore href=javascript:foo and page anchors
                if link[:11] == 'javascript:' or link[:1] == '#':
                    return
                if link[:1] == '/':    # full path
                    path = link
                elif link[:1] == '?':  # GET params only
                    path = "%s%s" % (path.split('?')[0], link)
                else:                  # relative path: swap the last segment
                    path = path[:path.rfind('/') + 1] + link
                link = "%s://%s%s" % (proto, host, path)
            if link not in self.db:
                self.url_list.append(link)
            self.db[link] = self.db.get(link, 0) + 1
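            # db doubles as a hit counter: each distinct URL maps to the
            # number of times a link to it was seen.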

        def fetch_url(self):
            # Fetch the current URL and feed the HTML back through this
            # parser; handle_starttag() collects new links as a side effect.
            self.reset()
            req = urllib2.urlopen(self.url)
            self.feed(req.read())
            return True

        def crawl(self):
            visited = 0
            for self.url in self.url_list:  # grows as links are found
                visited += 1
                if visited > self.max_visits:
                    break
                log_debug("Visited %d URLs" % visited)
                proto, host, path = self.url_parts(self.url)
                if host not in self.allowed_hosts:
                    log_info("Skipping remote host %s..." % host)
                    continue
                # limit_paths is treated as a list of forbidden path
                # prefixes, matching the log message below.
                skip = False
                for lpath in self.limit_paths:
                    if path[:len(lpath)] == lpath:
                        skip = True
                        break
                if skip:
                    log_info("Skipping forbidden base path %s..." % path)
                    continue
                log_info("Opening URL %s" % self.url)
                res = PageFetchTask(self).start()
            log_info("Found %d distinct URLs" % len(self.db))
            return True

    def run(self):
        # run() is assumed to be the Script entry point that
        # ScriptManager.go() invokes.
        props = Properties.get_properties()
        start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
        max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
        limit_paths = props.get_property('constrictor.plugin.web_spider.limit_paths')
        limit_paths = limit_paths.split(',') if limit_paths else []
        if not start_url or not max_pages:
            log_error("Missing required properties: " +
                      "constrictor.plugin.web_spider.start_url, " +
                      "constrictor.plugin.web_spider.max_pages")
            return False
        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_paths)
        result = spider.crawl()
        return result

ScriptManager.go(WebSpiderScript())
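
# Example properties for this script (property names are the ones read in
# run() above; the values here are only illustrative):
#
#   constrictor.plugin.web_spider.start_url=http://www.example.com/
#   constrictor.plugin.web_spider.max_pages=100
#   constrictor.plugin.web_spider.limit_paths=/logout,/admin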