From 1fbcb81faec60b212c97cabffd974100decddfbc Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 11:54:47 -0700 Subject: [PATCH] forgot to add robot check in is_valid --- spacetime-crawler4py-master/scraper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index 89ba22c..9518209 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -39,7 +39,7 @@ def is_a_loop_trap(url): return False # Tests to see if the url is ok to be crawled by checking against the robots.txt -# file. It does so by checking the URL or URL prefixes +# file. It does so by checking the URL or URL prefixes and will return True if the page is allowed to be crawled # https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser # http://pymotw.com/2/robotparser/ def robots_ok(baseurl): @@ -129,6 +129,8 @@ def is_valid(url): return False elif is_a_loop_trap(url): return False + elif not robots_ok(url): + return False else: return True