forgot to add robot check in is_valid

This commit is contained in:
Lacerum 2022-04-18 11:54:47 -07:00
parent 577fdb5a80
commit 1fbcb81fae

View File

@ -39,7 +39,7 @@ def is_a_loop_trap(url):
return False
# Tests to see if the url is ok to be crawled by checking against the robots.txt
# file. It does so by checking the URL or URL prefixes
# file. It does so by checking the URL or URL prefixes and returns True if the page is allowed to be crawled
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
# http://pymotw.com/2/robotparser/
def robots_ok(baseurl):
@ -129,6 +129,8 @@ def is_valid(url):
return False
elif is_a_loop_trap(url):
return False
elif not robots_ok(url):
return False
else:
return True