added robots.txt check

Lacerum 2022-04-18 11:29:43 -07:00
parent 0e4187a5fa
commit 577fdb5a80


@@ -1,7 +1,9 @@
 from operator import truediv
 import re
+from urllib import robotparser
 from urllib.parse import urlparse
 from urllib.parse import urljoin
+from urllib.robotparser import RobotFileParser
 from bs4 import BeautifulSoup
 
 def scraper(url, resp):
@@ -18,6 +20,35 @@ def scraper(url, resp):
             invalid_links.write(link + "\n")
     return links_valid
 
+# hopefully fixes some loop traps and repeating (looping) directories
+# the amount of repeated subdirectories allowed can be changed
+# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
+# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
+def is_a_loop_trap(url):
+    word_dict = {}
+    parsed = urlparse(url)
+    url_path = str(parsed.path)
+    word_list = url_path.split('/')
+    for word in word_list:
+        if word in word_dict:
+            word_dict[word] += 1
+            if word_dict[word] == 3:
+                return True
+        else:
+            word_dict[word] = 1
+    return False
+
+# Tests whether the url is OK to crawl by checking it against the site's
+# robots.txt file (matching the URL or URL prefixes).
+# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
+# http://pymotw.com/2/robotparser/
+def robots_ok(baseurl):
+    eva = robotparser.RobotFileParser()
+    rooturl = str(urljoin(baseurl, '/')[:-1])   # get each subdomain by itself
+    eva.set_url(rooturl + "/robots.txt")        # set location of robots.txt
+    eva.read()                                  # read and feed to the parser
+    return eva.can_fetch('*', baseurl)          # True if this user agent may crawl baseurl
+
 def extract_next_links(url, resp):
     # Implementation required.
     # url: the URL that was used to get the page
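A note on the new helpers (editorial, not part of the diff): robots_ok builds a fresh RobotFileParser and re-downloads robots.txt for every URL it checks. If that becomes a bottleneck, the parser could be cached per site root. The sketch below shows one way to do that, reusing the same urljoin trick as robots_ok; the cache dict and the robots_ok_cached name are assumptions for illustration, not code from this repository.

# Sketch only: per-domain caching of robots.txt lookups (hypothetical helper,
# not part of this commit). Same parser calls as robots_ok, but fetched once per site.
from urllib import robotparser
from urllib.parse import urljoin

_robots_cache = {}  # assumed cache: site root -> RobotFileParser

def robots_ok_cached(baseurl):
    rooturl = str(urljoin(baseurl, '/')[:-1])   # scheme://host, same trick as robots_ok
    parser = _robots_cache.get(rooturl)
    if parser is None:
        parser = robotparser.RobotFileParser()
        parser.set_url(rooturl + "/robots.txt")
        parser.read()                           # download robots.txt only once per domain
        _robots_cache[rooturl] = parser
    return parser.can_fetch('*', baseurl)

As for is_a_loop_trap, a quick sanity check of its behavior: is_a_loop_trap("http://www.ics.uci.edu/a/b/a/b/a/b/page") returns True because the path segment "a" occurs three times, while an ordinary shallow path returns False.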
@@ -59,24 +90,6 @@ def extract_next_links(url, resp):
         print("Page error !")
     return pages
-
-# hopefuly fixes some loop traps and repeating (looping) directories
-# the amount of repeated subdirectories allowed can be changed
-# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
-# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
-def is_a_loop_trap(url):
-    word_dict = {}
-    parsed = urlparse(url)
-    url_path = str(parsed.path)
-    word_list = url_path.split('/')
-    for word in word_list:
-        if word in word_dict:
-            word_dict[word] += 1
-            if word_dict[word] == 3:
-                return True
-        else:
-            word_dict[word] = 1
-    return False
 
 #*.ics.uci.edu/*
 #*.cs.uci.edu/*
 #*.informatics.uci.edu/*
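The trailing comments list the URL patterns the crawler is restricted to (*.ics.uci.edu, *.cs.uci.edu, *.informatics.uci.edu; any further patterns are outside this hunk). A minimal sketch of how such a host filter might be written with the already-imported re and urlparse follows; the function name and the exact pattern list are assumptions for illustration, not the repository's is_valid logic.

# Sketch only: host filter matching the allowed-domain comments above.
# Only the three patterns visible in this hunk are included; the real
# filtering code in the repository may differ.
import re
from urllib.parse import urlparse

ALLOWED_HOSTS = re.compile(r"(^|\.)(ics|cs|informatics)\.uci\.edu$")

def in_allowed_domain(url):
    host = urlparse(url).netloc.lower().split(':')[0]  # strip any port
    return bool(ALLOWED_HOSTS.search(host))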