From 0e5af0a4c7476c86ea09077c0ea0b0be9ad81621 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 11:59:56 -0700 Subject: [PATCH] added commented out robot check in next link --- spacetime-crawler4py-master/scraper.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index 9518209..cfa07d9 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -82,7 +82,13 @@ def extract_next_links(url, resp): #skipping query with specific actions which mutate the websites and cause a trap if "do=" in href_link: continue - + '''
+ # this is currently in is_valid but implemented in a different way; not sure which approach makes more sense
+ # skip as not allowed
+ if not robots_ok(href_link):
+ continue
+ '''
+ tempFile.write(href_link + "\n") #Adding to the boi wonder pages pages.append(href_link) @@ -129,6 +135,7 @@ def is_valid(url): return False elif is_a_loop_trap(url): return False + # maybe this check should go in extract_next_links instead? elif not robots_ok(url): return False else: