added robots.txt check
parent 0e4187a5fa
commit 577fdb5a80
@@ -1,7 +1,9 @@
 from operator import truediv
 import re
+from urllib import robotparser
 from urllib.parse import urlparse
 from urllib.parse import urljoin
+from urllib.robotparser import RobotFileParser
 from bs4 import BeautifulSoup
 
 def scraper(url, resp):
@@ -18,6 +20,35 @@ def scraper(url, resp):
         invalid_links.write(link + "\n")
     return links_valid
 
+# hopefully fixes some loop traps and repeating (looping) directories
+# the amount of repeated subdirectories allowed can be changed
+# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
+# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
+def is_a_loop_trap(url):
+    word_dict = {}
+    parsed = urlparse(url)
+    url_path = str(parsed.path)
+    word_list = url_path.split('/')
+    for word in word_list:
+        if word in word_dict:
+            word_dict[word] += 1
+            if word_dict[word] == 3:
+                return True
+        else:
+            word_dict[word] = 1
+    return False
+
+# Tests to see if the url is ok to be crawled by checking against the robots.txt
+# file. It does so by checking the URL or URL prefixes
+# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
+# http://pymotw.com/2/robotparser/
+def robots_ok(baseurl):
+    eva = robotparser.RobotFileParser()
+    rooturl = str(urljoin(baseurl, '/')[:-1])  # get each subdomain by itself
+    eva.set_url(rooturl + "/robots.txt")  # set location of robots.txt
+    eva.read()  # read and feed to parser
+    return eva.can_fetch('*', baseurl)  # returns True if the user agent is allowed to crawl
+
 def extract_next_links(url, resp):
     # Implementation required.
     # url: the URL that was used to get the page
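As a quick reference (not part of the commit), here is a standalone sketch of how the new robots_ok() helper is expected to behave; the example URL is a placeholder. Note that eva.read() downloads robots.txt on every call, so a real crawler would likely cache one parser per host.

# Standalone sketch of the robots.txt check added above (example URL is hypothetical).
from urllib import robotparser
from urllib.parse import urljoin

def robots_ok(baseurl):
    eva = robotparser.RobotFileParser()
    rooturl = str(urljoin(baseurl, '/')[:-1])  # scheme + host only, no trailing slash
    eva.set_url(rooturl + "/robots.txt")       # robots.txt lives at the site root
    eva.read()                                 # fetches and parses the file (network call)
    return eva.can_fetch('*', baseurl)         # True if the '*' user agent may crawl baseurl

if __name__ == "__main__":
    # Placeholder URL; this performs a real HTTP request for robots.txt.
    print(robots_ok("https://www.ics.uci.edu/about"))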
@@ -59,24 +90,6 @@ def extract_next_links(url, resp):
         print("Page error !")
     return pages
 
-# hopefuly fixes some loop traps and repeating (looping) directories
-# the amount of repeated subdirectories allowed can be changed
-# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
-# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
-def is_a_loop_trap(url):
-    word_dict = {}
-    parsed = urlparse(url)
-    url_path = str(parsed.path)
-    word_list = url_path.split('/')
-    for word in word_list:
-        if word in word_dict:
-            word_dict[word] += 1
-            if word_dict[word] == 3:
-                return True
-        else:
-            word_dict[word] = 1
-    return False
-
 #*.ics.uci.edu/*
 #*.cs.uci.edu/*
 #*.informatics.uci.edu/*
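For completeness, a standalone sketch (example URLs are made up, not from the repo) of what the relocated is_a_loop_trap() helper flags: any path segment that appears three times marks the URL as a probable crawler trap.

# Standalone sketch of the loop-trap heuristic now defined above extract_next_links;
# the URLs below are hypothetical examples.
from urllib.parse import urlparse

def is_a_loop_trap(url):
    word_dict = {}
    parsed = urlparse(url)
    url_path = str(parsed.path)
    word_list = url_path.split('/')
    for word in word_list:
        if word in word_dict:
            word_dict[word] += 1
            if word_dict[word] == 3:
                return True                 # a segment repeated 3 times looks like a trap
        else:
            word_dict[word] = 1
    return False

print(is_a_loop_trap("http://example.com/people/faculty/profile"))              # False
print(is_a_loop_trap("http://example.com/calendar/calendar/calendar/2022-01"))  # True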