Forgot to add the robots.txt check in is_valid
This commit is contained in:
parent
577fdb5a80
commit
1fbcb81fae
@ -39,7 +39,7 @@ def is_a_loop_trap(url):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
# Tests to see if the url is ok to be crawled by checking against the robots.txt
|
# Tests to see if the url is ok to be crawled by checking against the robots.txt
|
||||||
# file. It does so by checking the URL or URL prefixes
|
# file. It does so by checking the URL or URL prefixes, and returns True if the page is allowed to be crawled
|
||||||
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
|
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
|
||||||
# http://pymotw.com/2/robotparser/
|
# http://pymotw.com/2/robotparser/
|
||||||
def robots_ok(baseurl):
|
def robots_ok(baseurl):
|
||||||
@ -129,6 +129,8 @@ def is_valid(url):
|
|||||||
return False
|
return False
|
||||||
elif is_a_loop_trap(url):
|
elif is_a_loop_trap(url):
|
||||||
return False
|
return False
|
||||||
|
elif not robots_ok(url):
|
||||||
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user