From 0e4187a5fa459fc4b6ecb7bc0570727a0e2b7163 Mon Sep 17 00:00:00 2001
From: Lacerum
Date: Mon, 18 Apr 2022 02:25:03 -0700
Subject: [PATCH] added a looping and repeating trap fix

---
 spacetime-crawler4py-master/.gitignore |  4 ++++
 spacetime-crawler4py-master/scraper.py | 22 +++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 spacetime-crawler4py-master/.gitignore

diff --git a/spacetime-crawler4py-master/.gitignore b/spacetime-crawler4py-master/.gitignore
new file mode 100644
index 0000000..416ebbb
--- /dev/null
+++ b/spacetime-crawler4py-master/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/*
+logs/*
+utils/*
+crawler/__pycache__/*
diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index cd78471..72db35b 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -1,3 +1,4 @@
+from operator import truediv
 import re
 from urllib.parse import urlparse
 from urllib.parse import urljoin
@@ -58,6 +59,24 @@ def extract_next_links(url, resp):
             print("Page error !")
     return pages
 
+# hopefully fixes some loop traps and repeating (looping) directories
+# the number of repeated subdirectories allowed can be changed
+# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
+# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
+def is_a_loop_trap(url):
+    word_dict = {}
+    parsed = urlparse(url)
+    url_path = str(parsed.path)
+    word_list = url_path.split('/')
+    for word in word_list:
+        if word in word_dict:
+            word_dict[word] += 1
+            if word_dict[word] == 3:
+                return True
+        else:
+            word_dict[word] = 1
+    return False
+
 #*.ics.uci.edu/*
 #*.cs.uci.edu/*
 #*.informatics.uci.edu/*
@@ -95,7 +114,8 @@ def is_valid(url):
             return False
         elif parsed.fragment:
             return False
-        # will add trap check here most likely
+        elif is_a_loop_trap(url):
+            return False
         else:
             return True
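
For reference, below is a minimal standalone sketch of the repeated-segment check this patch adds. It mirrors is_a_loop_trap() from scraper.py: a URL is flagged as a trap once any single path segment occurs three times. The demo harness and the example URLs are hypothetical, chosen only to illustrate the threshold; they are not part of the patch.

    from urllib.parse import urlparse

    def is_a_loop_trap(url):
        # Count how often each path segment appears; a third occurrence
        # of the same segment marks the URL as a probable crawler trap.
        word_dict = {}
        for word in urlparse(url).path.split('/'):
            if word in word_dict:
                word_dict[word] += 1
                if word_dict[word] == 3:
                    return True
            else:
                word_dict[word] = 1
        return False

    if __name__ == "__main__":
        # Hypothetical examples (not taken from the crawler's logs):
        # "calendar" repeats three times, so the first URL is rejected;
        # the second URL has no segment repeated that often.
        print(is_a_loop_trap("https://www.ics.uci.edu/calendar/calendar/calendar/week"))  # True
        print(is_a_loop_trap("https://www.ics.uci.edu/ugrad/courses/listing"))            # False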