From 4cd19c664e6b9aee862a16ed64d2f99c84a21e3d Mon Sep 17 00:00:00 2001
From: erickson
Date: Thu, 21 Apr 2011 14:40:46 +0000
Subject: [PATCH] allow a list of limit_path options for tighter control over
 which resources to crawl

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/constrictor/trunk@1411 6d9bc8c9-1ec2-4278-b937-99fde70a366f
---
 constrictor.properties |  4 ++--
 samples/web_spider.py  | 24 ++++++++++++++++--------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/constrictor.properties b/constrictor.properties
index 378c9a46a..54b24ccbf 100644
--- a/constrictor.properties
+++ b/constrictor.properties
@@ -42,7 +42,7 @@
 constrictor.plugin.web_spider.start_url=http://example.org/somepath?foo=bar,http
 constrictor.plugin.web_spider.max_pages=100
 
-# Only allow the spider to fetch pages with a certain base path
-constrictor.plugin.web_spider.limit_path=/somepath
+# Only allow the spider to fetch pages whose path begins with one of these base paths
+constrictor.plugin.web_spider.limit_paths=/somepath,/otherpath
 
 
diff --git a/samples/web_spider.py b/samples/web_spider.py
index 0c82b4270..5c82fa9e7 100755
--- a/samples/web_spider.py
+++ b/samples/web_spider.py
@@ -29,7 +29,7 @@ class WebSpiderScript(Script):
 
     class Spider(HTMLParser):
 
-        def __init__(self, url, max_visits, limit_path='', allowed_hosts=[]):
+        def __init__(self, url, max_visits, limit_paths=None, allowed_hosts=[]):
 
             HTMLParser.__init__(self)
             self.url = url
@@ -38,7 +38,7 @@ class WebSpiderScript(Script):
             self.max_visits = max_visits
             self.allowed_hosts = allowed_hosts
             proto, self.host, path = self.url_parts(url)
-            self.limit_path = limit_path
+            self.limit_paths = limit_paths or []
 
             try:
                 foo = self.allowed_hosts.index(self.host)
@@ -120,10 +120,16 @@ class WebSpiderScript(Script):
                     log_info("Skipping remote host %s..." % host)
                     continue
 
-                if self.limit_path:
-                    if path[:len(self.limit_path)] != self.limit_path:
-                        log_info("Skipping forbidden base path %s..." % path)
-                        continue
+                # An empty limit list means no path restriction
+                valid = not self.limit_paths
+                for lpath in self.limit_paths:
+                    if path.startswith(lpath):
+                        valid = True
+                        break
+
+                if not valid:
+                    log_info("Skipping forbidden base path %s..." % path)
+                    continue
 
                 try:
                     log_info("Opening URL %s" % self.url)
@@ -141,14 +147,16 @@
         props = Properties.get_properties()
         start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
         max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
-        limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+        limit_paths = props.get_property('constrictor.plugin.web_spider.limit_paths')
+        # The property is optional; guard against a missing value before splitting
+        limit_paths = limit_paths.split(',') if limit_paths else []
 
         if not start_url or not max_pages:
             log_error("Missing required properties: " +
                 "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
             return False
 
-        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_paths)
         result = spider.crawl()
 
         return True
-- 
2.43.2
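
Note on the behaviour this patch introduces: a URL is crawled only if no limit paths are configured, or if its path begins with at least one of the configured prefixes. Below is a minimal standalone sketch of that logic for reference. The helper names parse_limit_paths and path_allowed are illustrative only and do not exist in constrictor; they mirror the property parsing in run() and the prefix check in crawl() above.

# Sketch of the limit_paths behaviour; helper names are hypothetical.

def parse_limit_paths(raw):
    """Split the comma-separated limit_paths property; unset means no limit."""
    return raw.split(',') if raw else []

def path_allowed(path, limit_paths):
    """A path is crawlable if no limits are set or it starts with one of them."""
    if not limit_paths:
        return True
    return any(path.startswith(lpath) for lpath in limit_paths)

if __name__ == '__main__':
    limits = parse_limit_paths('/somepath,/otherpath')
    assert path_allowed('/somepath/page.html', limits)
    assert path_allowed('/otherpath/index', limits)
    assert not path_allowed('/forbidden/page.html', limits)
    assert path_allowed('/anything', parse_limit_paths(None))  # property unset
    print('all checks passed')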