Added a fix for looping and repeating-directory crawler traps

This commit is contained in:
Lacerum 2022-04-18 02:25:03 -07:00
parent 2efcb22c58
commit 0e4187a5fa
2 changed files with 25 additions and 1 deletions

View File

@ -0,0 +1,4 @@
__pycache__/*
logs/*
utils/*
crawler/__pycache__/*

View File

@ -1,3 +1,4 @@
from operator import truediv
import re import re
from urllib.parse import urlparse from urllib.parse import urlparse
from urllib.parse import urljoin from urllib.parse import urljoin
@ -58,6 +59,24 @@ def extract_next_links(url, resp):
print("Page error !") print("Page error !")
return pages return pages
# Hopefully fixes some loop traps and repeating (looping) directories.
# The amount of repeated subdirectories allowed can be changed via max_repeats.
# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
def is_a_loop_trap(url, max_repeats=3):
    """Heuristically detect a crawler loop trap in *url*.

    Returns True when any single path segment occurs at least
    ``max_repeats`` times in the URL path (e.g. /a/b/a/c/a), which
    usually indicates a repeating-directory trap.

    Args:
        url: Absolute or relative URL string.
        max_repeats: Occurrence count at which a segment is considered
            a trap (default 3, matching the original behavior).

    Returns:
        bool: True if the path looks like a loop trap, else False.
    """
    segment_counts = {}
    parsed = urlparse(url)
    for segment in parsed.path.split('/'):
        # Skip empty segments produced by leading/trailing/consecutive
        # slashes ('/a//b/' -> ['', 'a', '', 'b', '']); counting them
        # would falsely flag ordinary URLs as traps.
        if not segment:
            continue
        segment_counts[segment] = segment_counts.get(segment, 0) + 1
        if segment_counts[segment] >= max_repeats:
            return True
    return False
#*.ics.uci.edu/* #*.ics.uci.edu/*
#*.cs.uci.edu/* #*.cs.uci.edu/*
#*.informatics.uci.edu/* #*.informatics.uci.edu/*
@ -95,7 +114,8 @@ def is_valid(url):
return False return False
elif parsed.fragment: elif parsed.fragment:
return False return False
# will add trap check here most likely elif is_a_loop_trap(url):
return False
else: else:
return True return True