#!/usr/bin/python
# --------------------------------------------------------------
# Simple web spider sample script.  Starting from a configured
# URL, each page-fetch task downloads a page and queues the links
# found on it for further crawling.
# --------------------------------------------------------------

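# Properties read in run() below.  The values shown here are only
# illustrative examples, not taken from the original source:
#
#   constrictor.plugin.web_spider.start_url   = http://example.org/
#   constrictor.plugin.web_spider.max_pages   = 50
#   constrictor.plugin.web_spider.limit_paths = /docs,/wiki
#
# limit_paths is a comma-separated list of allowed base paths.
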
import re
import urllib2
from HTMLParser import HTMLParser
from constrictor.task import Task
from constrictor.script import Script, ScriptManager
from constrictor.properties import Properties
from constrictor.log import *


class PageFetchTask(Task):
    def __init__(self, spider, name=None):
        Task.__init__(self, name)
        self.spider = spider

    def run(self):
        # fetch a single page
        return self.spider.fetch_url()

class WebSpiderScript(Script):

    # Heavily modified version of the script found at
    # http://www.halotis.com/2009/09/16/python-web-crawler-script/

    class Spider(HTMLParser):

        def __init__(self, url, max_visits, limit_paths, allowed_hosts=None):

            HTMLParser.__init__(self)
            self.url = url
            self.db = {self.url: 1}        # URL -> number of times seen
            self.url_list = [self.url]     # crawl queue, in discovery order
            self.max_visits = max_visits
            # Avoid a shared mutable default argument
            self.allowed_hosts = allowed_hosts or []
            proto, self.host, path = self.url_parts(url)
            self.limit_paths = limit_paths

            # The starting host is always allowed
            if self.host not in self.allowed_hosts:
                self.allowed_hosts.append(self.host)

        def url_parts(self, url):
            # Split a URL into (protocol, host, path); path may be empty
            res = re.search(r'^(https?)://([^/]+)(.*)', url)
            if res is None:
                raise Exception("Invalid URL: %s" % url)

            proto = res.group(1)
            host = res.group(2)
            path = res.group(3)

            return proto, host, path

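        # Illustrative example of url_parts() (hypothetical URL, not from
        # the original source):
        #   url_parts("http://example.org/a/b?x=1")
        #   -> ("http", "example.org", "/a/b?x=1")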

        def handle_starttag(self, tag, attrs):

            if tag == 'a' and attrs:

                link = [h for h in attrs if h[0] == 'href']
                if len(link) == 0: return
                link = link[0][1]

                if link[:4] != "http":
                    proto, host, path = self.url_parts(self.url)

                    # Ignore href=javascript:foo and page anchors
                    if link[:11] == 'javascript:' or link[:1] == '#':
                        return

                    if link[:1] == '/': # absolute path on the current host
                        path = link

                    elif link[:1] == '?': # GET params only
                        # Drop any existing query string, then append the new one
                        path = re.sub(r'\?.*$', '', path) + link

                    else: # relative path: replace the last path segment
                        parts = path.split('/')
                        path = path.replace(parts[-1], link)

                    link = "%s://%s%s" % (proto, host, path)

                # Queue each distinct URL once, but count every reference
                if link not in self.db:
                    self.url_list.append(link)

                self.db[link] = (self.db.get(link) or 0) + 1
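                # Example of the relative-link handling above (hypothetical
                # values, not from the original source): with
                # self.url = "http://example.org/a/b.html", an href of
                # "c.html" resolves to "http://example.org/a/c.html"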

        def fetch_url(self):
            req = urllib2.urlopen(self.url)
            return req.read()

        def crawl(self):

            visited = 0

            # url_list grows as handle_starttag() discovers new links
            for self.url in self.url_list:

                if visited >= self.max_visits: break

                log_debug("Visited %d URLs" % visited)

                proto, host, path = self.url_parts(self.url)

                if host not in self.allowed_hosts:
                    log_info("Skipping remote host %s..." % host)
                    continue

                # An empty limit_paths list means no path restriction
                valid = not self.limit_paths
                for lpath in self.limit_paths:
                    if path[:len(lpath)] == lpath:
                        valid = True
                        break

                if not valid:
                    log_info("Skipping forbidden base path %s..." % path)
                    continue

                try:
                    log_info("Opening URL %s" % self.url)
                    res = PageFetchTask(self).start()
                    self.reset()
                    self.feed(res)
                    visited += 1
                except:
                    # Fetch/parse errors: reset the parser and move on
                    self.reset()

            log_info("Found %d distinct URLs" % len(self.db))

    def run(self):

        props = Properties.get_properties()
        start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
        max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
        limit_paths = props.get_property('constrictor.plugin.web_spider.limit_paths')
        # limit_paths is optional; an unset property would otherwise break split()
        limit_paths = limit_paths.split(',') if limit_paths else []

        if not start_url or not max_pages:
            log_error("Missing required properties: " +
                "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
            return False

        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_paths)
        spider.crawl()

        return True

# Launch the script
ScriptManager.go(WebSpiderScript())