#!/usr/bin/python
# --------------------------------------------------------------
# Simple web spider sample script.  Starting from a configured
# URL, each page-fetch task downloads a page and queues the links
# found on it for further crawling.
# --------------------------------------------------------------

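# Properties read in run() below.  The values shown here are only
# illustrative examples, not taken from the original source:
#
#   constrictor.plugin.web_spider.start_url   = http://example.org/
#   constrictor.plugin.web_spider.max_pages   = 50
#   constrictor.plugin.web_spider.limit_paths = /docs,/wiki
#
# limit_paths is a comma-separated list of allowed base paths.
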
import re
import urllib2
from HTMLParser import HTMLParser
from constrictor.task import Task
from constrictor.script import Script, ScriptManager
from constrictor.properties import Properties
from constrictor.log import *


class PageFetchTask(Task):
    def __init__(self, spider, name=None):
        Task.__init__(self, name)
        self.spider = spider

    def run(self):
        # fetch a single page
        return self.spider.fetch_url()

class WebSpiderScript(Script):

    # Heavily modified version of the script found at
    # http://www.halotis.com/2009/09/16/python-web-crawler-script/

    class Spider(HTMLParser):

        def __init__(self, url, max_visits, limit_paths, allowed_hosts=None):

            HTMLParser.__init__(self)
            self.url = url
            self.db = {self.url: 1}        # URL -> number of times seen
            self.url_list = [self.url]     # crawl queue, in discovery order
            self.max_visits = max_visits
            # Avoid a shared mutable default argument
            self.allowed_hosts = allowed_hosts or []
            proto, self.host, path = self.url_parts(url)
            self.limit_paths = limit_paths

            # The starting host is always allowed
            if self.host not in self.allowed_hosts:
                self.allowed_hosts.append(self.host)

        def url_parts(self, url):
            # Split a URL into (protocol, host, path); path may be empty
            res = re.search(r'^(https?)://([^/]+)(.*)', url)
            if res is None:
                raise Exception("Invalid URL: %s" % url)

            proto = res.group(1)
            host = res.group(2)
            path = res.group(3)

            return proto, host, path

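        # Illustrative example of url_parts() (hypothetical URL, not from
        # the original source):
        #   url_parts("http://example.org/a/b?x=1")
        #   -> ("http", "example.org", "/a/b?x=1")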

        def handle_starttag(self, tag, attrs):

            if tag == 'a' and attrs:

                link = [h for h in attrs if h[0] == 'href']
                if len(link) == 0: return
                link = link[0][1]

                if link[:4] != "http":
                    proto, host, path = self.url_parts(self.url)

                    # Ignore href=javascript:foo and page anchors
                    if link[:11] == 'javascript:' or link[:1] == '#':
                        return

                    if link[:1] == '/': # absolute path on the current host
                        path = link

                    elif link[:1] == '?': # GET params only
                        # Drop any existing query string, then append the new one
                        path = re.sub(r'\?.*$', '', path) + link

                    else: # relative path: replace the last path segment
                        parts = path.split('/')
                        path = path.replace(parts[-1], link)

                    link = "%s://%s%s" % (proto, host, path)

                # Queue each distinct URL once, but count every reference
                if link not in self.db:
                    self.url_list.append(link)

                self.db[link] = (self.db.get(link) or 0) + 1
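                # Example of the relative-link handling above (hypothetical
                # values, not from the original source): with
                # self.url = "http://example.org/a/b.html", an href of
                # "c.html" resolves to "http://example.org/a/c.html"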

        def fetch_url(self):
            req = urllib2.urlopen(self.url)
            return req.read()

        def crawl(self):

            visited = 0

            # url_list grows as handle_starttag() discovers new links
            for self.url in self.url_list:

                if visited >= self.max_visits: break

                log_debug("Visited %d URLs" % visited)

                proto, host, path = self.url_parts(self.url)

                if host not in self.allowed_hosts:
                    log_info("Skipping remote host %s..." % host)
                    continue

                # An empty limit_paths list means no path restriction
                valid = not self.limit_paths
                for lpath in self.limit_paths:
                    if path[:len(lpath)] == lpath:
                        valid = True
                        break

                if not valid:
                    log_info("Skipping forbidden base path %s..." % path)
                    continue

                try:
                    log_info("Opening URL %s" % self.url)
                    res = PageFetchTask(self).start()
                    self.reset()
                    self.feed(res)
                    visited += 1
                except:
                    # Fetch/parse errors: reset the parser and move on
                    self.reset()

            log_info("Found %d distinct URLs" % len(self.db))

    def run(self):

        props = Properties.get_properties()
        start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
        max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
        limit_paths = props.get_property('constrictor.plugin.web_spider.limit_paths')
        # limit_paths is optional; an unset property would otherwise break split()
        limit_paths = limit_paths.split(',') if limit_paths else []

        if not start_url or not max_pages:
            log_error("Missing required properties: " +
                "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
            return False

        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_paths)
        spider.crawl()

        return True

# Launch the script
ScriptManager.go(WebSpiderScript())