Merge remote-tracking branch 'origin/traps'

iNocturnis committed 2022-04-20 17:49:34 -07:00
commit b495292b87
2 changed files with 50 additions and 63 deletions

View File

@@ -0,0 +1,35 @@
import re
from urllib import robotparser
from bs4 import BeautifulSoup
from collections import defaultdict
import requests

# Tests whether a url is ok to be crawled by checking it against the site's robots.txt
# file. Returns True if the page is allowed to be crawled, True if there is no robots.txt
# file, and False otherwise.
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
# http://pymotw.com/2/robotparser/
# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
robots_seen = dict() # all robots.txt verdicts go here (global so we can store them across the whole site)

def robots_ok(parsed) -> bool:
    global robots_seen                                   # global dict for robots.txt results
    robots_seen[parsed.netloc] = False                   # default for this host
    try:
        url = 'http://' + parsed.netloc + '/robots.txt'  # build the robots.txt url for this host
        response = requests.get(url)                     # fetch robots.txt
        if response.status_code != 200:                  # no robots.txt file, so let her rip
            robots_seen[parsed.netloc] = True
            return True
        eva = robotparser.RobotFileParser(url)
        eva.read()                                       # read and feed robots.txt to the parser
        if eva.can_fetch('*', parsed.geturl()):          # if eva may fetch the page itself, record it
            robots_seen[parsed.netloc] = True
        return robots_seen[parsed.netloc]                # the cached verdict
    except:
        return False                                     # default on any error

# Check if the site is already in the dict; if it is not, run the check and cache it.
def robots_are_ok(parsed):
    global robots_seen
    if parsed.netloc not in robots_seen:                 # not cached yet, so check the site
        return robots_ok(parsed)
    else:
        return robots_seen[parsed.netloc]                # already read, return the cached verdict
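
The two helpers above keep one allow/deny verdict per host in robots_seen, keyed by parsed.netloc, so robots.txt is fetched at most once per site. A minimal usage sketch, assuming the new file is saved as robotsokay.py (which the `from robotsokay import *` in the second file suggests) and using made-up URLs:

from urllib.parse import urlparse
from robotsokay import robots_are_ok, robots_seen   # module name assumed from the import in the scraper file

candidate_links = [
    "http://www.example.com/page1.html",             # hypothetical URLs, for illustration only
    "http://www.example.com/page2.html",
]

for link in candidate_links:
    parsed = urlparse(link)
    if robots_are_ok(parsed):        # first call for a host fetches its robots.txt, later calls hit the cache
        print("ok to crawl:", link)
    else:
        print("skipping:", link)

print(robots_seen)                   # e.g. {'www.example.com': True} once the host has been checked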

View File

@@ -1,25 +1,10 @@
 from distutils.filelist import findall
 from operator import truediv
 import re
-import urllib.request
-from urllib import robotparser
 from urllib.parse import urlparse
 from urllib.parse import urljoin
-from urllib.robotparser import RobotFileParser
 from bs4 import BeautifulSoup
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-from nltk.corpus import words
-import html2text
-import nltk
-#moved all my code to a separted py file and imported it here
-from datacollection import *
-# nltk.download('stopwords')
-# nltk.download('words')
-# there is another nltk.download() requirement but I removed it so i forgot what it was
-# it'll show in the console/terminal if you run the code i believe. it appeared in mine
+from robotsokay import *

 def scraper(url, resp):
     links = extract_next_links(url, resp)
@@ -68,35 +53,6 @@ def scraper(url, resp):
     return links_valid

-# hopefuly fixes some loop traps and repeating (looping) directories
-# the amount of repeated subdirectories allowed can be changed
-# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
-# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
-def is_a_loop_trap(url):
-    word_dict = {}
-    parsed = urlparse(url)
-    url_path = str(parsed.path)
-    word_list = url_path.split('/')
-    for word in word_list:
-        if word in word_dict:
-            word_dict[word] += 1
-            if word_dict[word] == 3:
-                return True
-        else:
-            word_dict[word] = 1
-    return False
-
-# Tests to see if the url is ok to be crawled by checking against the robots.txt
-# file. It does so by checking the URL or URL prefixes and will return true if page is allowed to be crawled
-# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
-# http://pymotw.com/2/robotparser/
-def robots_ok(baseurl):
-    eva = robotparser.RobotFileParser()
-    rooturl = str(urljoin(baseurl, '/')[:-1]) # get each path by itself
-    eva.set_url(rooturl + "/robots.txt") # set location of robots.txt
-    eva.read() # read and fead to parser
-    return eva.can_fetch('*', baseurl) # returns true if useragent is allowed to crawl
-
 def extract_next_links(url, resp):
     # Implementation required.
     # url: the URL that was used to get the page
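
This commit drops is_a_loop_trap in favor of the regex checks in is_valid further down. For reference, the removed check flagged a URL as soon as any path segment appeared three times; a small sketch adapted from the removed lines, with made-up URLs:

from urllib.parse import urlparse

def is_a_loop_trap(url):
    # adapted from the removed lines above: flag a URL once any path segment shows up 3 times
    word_dict = {}
    word_list = str(urlparse(url).path).split('/')
    for word in word_list:
        if word in word_dict:
            word_dict[word] += 1
            if word_dict[word] == 3:
                return True
        else:
            word_dict[word] = 1
    return False

print(is_a_loop_trap("http://www.example.edu/a/b/a/b/a/b"))  # True: 'a' and 'b' each appear 3 times
print(is_a_loop_trap("http://www.example.edu/a/b/c"))        # False: no repeated segment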
@@ -130,13 +86,13 @@ def extract_next_links(url, resp):
         #skipping queries with specific actions which mutate the website and cause a trap
         if "do=" in href_link:
             continue
-        '''
-        # this is currently in the is_vaild but implimended in a different way, don't know which one would make more sense
-        # skip as not allowed
-        if not robots_ok(href_link):
-            continue
-        '''
+        # don't know if this is too expensive, otherwise idk
+        # take the parsed url; if it is not ok per robots.txt we skip it, else we can write the file
+        parsed = urlparse(href_link)
+        if not robots_are_ok(parsed):
+            continue
         tempFile.write(href_link + "\n")
         #Adding to the boi wonder pages
         pages.append(href_link)
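
The hunk above only shows the middle of the loop over extracted links. A self-contained sketch of that filtering step, assuming the robotsokay helpers shown earlier; filter_links, base_url, and html are illustrative names, and the real extract_next_links also writes to tempFile and does further checks:

from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from robotsokay import robots_are_ok      # assumed module name for the new helper file

def filter_links(base_url, html):
    # Simplified stand-in for the filtering done inside extract_next_links.
    pages = []
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all("a", href=True):
        href_link = urljoin(base_url, tag["href"])
        if "do=" in href_link:            # skip mutating query actions (e.g. do=edit) that behave like traps
            continue
        parsed = urlparse(href_link)
        if not robots_are_ok(parsed):     # skip anything the host's robots.txt disallows
            continue
        pages.append(href_link)
    return pages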
@@ -160,6 +116,7 @@ def is_valid(url):
     try:
         #Gotta check if they are in the domain
         parsed = urlparse(url)
+        url_parsed_path = parsed.path.lower() # this may help speed things up a little bit (fewer calls to parsed.path)
         if parsed.scheme not in set(["http", "https"]):
             return False
         elif re.match(
@@ -181,32 +138,27 @@ def is_valid(url):
             return False
         elif parsed.fragment:
             return False
-        elif is_a_loop_trap(url):
-            return False
-        # maybe this should go in the next link?
-        elif not robots_ok(url):
-            return False
         # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
         # length check for looping filters and queries (could add a hash check for similarity or a regex, but we don't know if we want that, as this works well enough)
         # we can adjust it based on what the crawler does as well
-        elif len(url) > 169:
+        if len(url) > 169:
             return False
         # this fixes any search box that keeps going page to page, currently allowing a depth of 2 filters
-        elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
+        if re.match(r".*(&filter%.*){3,}",url_parsed_path):
             return False
         # this is for urls which, when opened, download a file (do we want to download these files and tokenize them?)
-        # elif re.match(r"^.*\&format=(\D{3,4})\Z$",parsed.path.lower()):
+        # elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
         # return False
         # another looping directory check, but more advanced than the one contained in is_a_loop_trap
-        elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
+        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
             return False
         # extra directories check (we can add more as we find them)
-        elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()):
+        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
             return False
         # calendar checks plus adding or downloading calendar (ical)
-        elif re.match(r"^.*calendar.*$",parsed.path.lower()):
+        if re.match(r"^.*calendar.*$",url_parsed_path):
             return False
-        elif parsed.query.find('ical') != -1:
+        if parsed.query.find('ical') != -1:
             return False
         else:
             return True
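
The regex filters in is_valid are easier to read with concrete inputs. A short sketch exercising two of the patterns above plus the length cutoff; the patterns are copied from is_valid, while the sample URLs are invented for illustration:

import re
from urllib.parse import urlparse

samples = [
    "https://www.example.edu/events/calendar/2022-04",   # rejected: calendar-style path
    "https://www.example.edu/a/b/a/b/a/b/page.html",     # rejected: repeating directory loop
    "https://www.example.edu/about.html",                # passes these particular checks
]

for url in samples:
    path = urlparse(url).path.lower()
    looping_dirs = re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$", path)
    calendar_like = re.match(r"^.*calendar.*$", path)
    too_long = len(url) > 169
    print(url, "->", "reject" if (looping_dirs or calendar_like or too_long) else "keep")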