Moved robots_ok into its own file, like datacollect

Lacerum 2022-04-20 13:29:18 -07:00
parent ab39c4b8c6
commit 809b3dc820
2 changed files with 42 additions and 38 deletions

View File

@@ -0,0 +1,35 @@
import re
from urllib import robotparser
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
# Tests whether the url is ok to be crawled by checking it against the site's robots.txt
# file. Returns True if the page is allowed to be crawled (or if there is no robots.txt file), and False otherwise.
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
# http://pymotw.com/2/robotparser/
# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
robots_seen = dict() # cache of robots.txt results per host (global so it persists across the whole crawl)
def robots_ok(parsed)->bool:
    global robots_seen # global dict of per-host results
    robots_seen[parsed.netloc] = False # default: host seen but not crawlable
    try:
        url = 'http://' + parsed.netloc + '/robots.txt' # build the robots.txt url for this host
        sitemap = requests.get(url) # fetch robots.txt
        if sitemap.status_code != 200: # no robots.txt file, so let her rip
            return True
        eva = robotparser.RobotFileParser(url)
        eva.read()
        if eva.can_fetch('*', url): # if the agent is allowed, record it in the dict
            robots_seen[parsed.netloc] = True
        return robots_seen[parsed.netloc] # return the cached value
    except:
        return False # default to not crawlable on any error
# Check if the site is already in the dict; if not, run robots_ok and cache the result.
def robots_are_ok(parsed):
    global robots_seen
    if parsed.netloc not in robots_seen: # not cached yet, check the site
        return robots_ok(parsed)
    else:
        return robots_seen[parsed.netloc] # already checked, return the cached value
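For reference, a minimal sketch of the standard urllib.robotparser flow that the comments above point to, with the per-page check done against the page URL itself. The host and page path are assumptions for illustration only, not values from the crawler.

# Hedged sketch of the standard urllib.robotparser usage; "example.com" and the
# page path below are illustrative assumptions.
from urllib import robotparser
from urllib.parse import urlparse

def can_crawl(page_url: str) -> bool:
    parsed = urlparse(page_url)
    rp = robotparser.RobotFileParser()
    rp.set_url('http://' + parsed.netloc + '/robots.txt')  # point the parser at this host's robots.txt
    rp.read()                                               # fetch and parse the file
    return rp.can_fetch('*', page_url)                      # ask about the page itself

print(can_crawl('http://example.com/some/page.html'))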

View File

@@ -1,13 +1,10 @@
from distutils.filelist import findall
from operator import truediv
import re
import requests
from urllib import robotparser
from collections import defaultdict
from urllib.parse import urlparse
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
from robotsokay import *
def scraper(url, resp):
links = extract_next_links(url, resp)
@@ -23,35 +20,6 @@ def scraper(url, resp):
invalid_links.write(link + "\n")
return links_valid
# Tests whether the url is ok to be crawled by checking it against the site's robots.txt
# file. Returns True if the page is allowed to be crawled (or if there is no robots.txt file), and False otherwise.
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
# http://pymotw.com/2/robotparser/
# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
robots_seen = dict() # cache of robots.txt results per host (global so it persists across the whole crawl)
def robots_ok(parsed)->bool:
    '''
    eva = robotparser.RobotFileParser()
    rooturl = str(urljoin(baseurl, '/')[:-1]) # get each path by itself
    eva.set_url(rooturl + "/robots.txt") # set location of robots.txt
    eva.read() # read and feed to the parser
    return eva.can_fetch('*', baseurl) # returns True if the user agent is allowed to crawl
    '''
    global robots_seen # global dict of per-host results
    robots_seen[parsed.netloc] = False # default: host seen but not crawlable
    try:
        url = 'http://' + parsed.netloc + '/robots.txt' # build the robots.txt url for this host
        sitemap = requests.get(url) # fetch robots.txt
        if sitemap.status_code != 200: # no robots.txt file, so let her rip
            return True
        eva = robotparser.RobotFileParser(url)
        eva.read()
        if eva.can_fetch('*', url): # if the agent is allowed, record it in the dict
            robots_seen[parsed.netloc] = True
        return robots_seen[parsed.netloc] # return the cached value
    except:
        return False # default to not crawlable on any error
def extract_next_links(url, resp):
# Implementation required.
# url: the URL that was used to get the page
@@ -86,9 +54,10 @@ def extract_next_links(url, resp):
if "do=" in href_link:
continue
# idk if this is too expensive, will have to test; don't think it should go into is_valid??
# don't know if this is too expensive, otherwise idk
# take the parsed url; if it is not ok per robots, skip it, otherwise we can write it to the file
parsed = urlparse(href_link)
if not robots_ok(parsed):
if not robots_are_ok(parsed):
continue
tempFile.write(href_link + "\n")
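For context, a rough usage sketch of the new call site: each extracted link is parsed and checked against the cached per-host robots result before being written out. The links list and the output filename are assumptions for illustration; only robots_are_ok comes from robotsokay.

# Hedged usage sketch of the robots_are_ok call site; the links list and the
# output filename are illustrative assumptions.
from urllib.parse import urlparse
from robotsokay import robots_are_ok

links = ['http://example.com/a.html', 'http://example.com/b.html']
with open('valid_links.txt', 'a') as tempFile:
    for href_link in links:
        parsed = urlparse(href_link)
        if not robots_are_ok(parsed):  # per-host result is cached after the first lookup
            continue
        tempFile.write(href_link + "\n")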