moved robots ok to other file like datacollect
parent ab39c4b8c6
commit 809b3dc820
35	spacetime-crawler4py-master/robotsokay.py	Normal file
@@ -0,0 +1,35 @@
+import re
+from urllib import robotparser
+from bs4 import BeautifulSoup
+from collections import defaultdict
+import requests
+
+# Tests to see if the url is ok to be crawled by checking it against the host's robots.txt file.
+# Returns True if the page is allowed to be crawled, True if there is no robots file, and False otherwise.
+# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
+# http://pymotw.com/2/robotparser/
+# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
+robots_seen = dict()  # all robots.txt results go here (global so we can store them across the whole crawl)
+def robots_ok(parsed) -> bool:
+    global robots_seen                                   # global dict of per-host results
+    robots_seen[parsed.netloc] = False                   # default: host seen, not allowed
+    try:
+        url = 'http://' + parsed.netloc + '/robots.txt'  # build the robots.txt url for this host
+        sitemap = requests.get(url)                      # fetch robots.txt
+        if sitemap.status_code != 200:                   # no robots file, so let her rip
+            return True
+        eva = robotparser.RobotFileParser(url)
+        eva.read()
+        if eva.can_fetch('*', url):                      # if eva may fetch the url, record it in the dict
+            robots_seen[parsed.netloc] = True
+        return robots_seen[parsed.netloc]                # the cached result
+    except:
+        return False                                     # default on any error
+
+# Check if the site is already in the dict; if not, run it through robots_ok to fill the dict.
+def robots_are_ok(parsed):
+    global robots_seen
+    if parsed.netloc not in robots_seen:                 # not seen yet, so check the site
+        return robots_ok(parsed)
+    else:
+        return robots_seen[parsed.netloc]                # already read, return its cached value
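For reference, robots_ok builds on the standard urllib.robotparser flow covered by the links above. A minimal sketch of that flow, assuming a reachable host (the URL here is hypothetical):

# Minimal urllib.robotparser flow (hypothetical URL), per the docs referenced above.
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.edu/robots.txt")    # point the parser at the host's robots.txt
rp.read()                                          # download and parse the file
# True if the '*' user agent is allowed to crawl the page, False otherwise
print(rp.can_fetch("*", "http://www.example.edu/some/page.html"))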
@@ -1,13 +1,10 @@
 from distutils.filelist import findall
 from operator import truediv
 import re
-import requests
-from urllib import robotparser
-from collections import defaultdict
 from urllib.parse import urlparse
 from urllib.parse import urljoin
-from urllib.robotparser import RobotFileParser
 from bs4 import BeautifulSoup
+from robotsokay import *

 def scraper(url, resp):
     links = extract_next_links(url, resp)
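The wildcard import pulls the three names defined in robotsokay.py into the crawler's namespace. An explicit equivalent, shown only as a sketch (the commit itself keeps the wildcard form):

# Explicit alternative to the wildcard import (sketch only).
from robotsokay import robots_ok, robots_are_ok, robots_seen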
@@ -23,35 +20,6 @@ def scraper(url, resp):
             invalid_links.write(link + "\n")
     return links_valid

-# Tests to see if the url is ok to be crawled by checking against the robots.txt
-# file. return true if page is allowed to be crawled, returns true if not robots file, and false otherwise
-# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
-# http://pymotw.com/2/robotparser/
-# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
-robots_seen = dict() # all robots go here (global so we can store over all site)
-def robots_ok(parsed)->bool:
-    '''
-    eva = robotparser.RobotFileParser()
-    rooturl = str(urljoin(baseurl, '/')[:-1]) # get each path by itself
-    eva.set_url(rooturl + "/robots.txt") # set location of robots.txt
-    eva.read() # read and fead to parser
-    return eva.can_fetch('*', baseurl) # returns true if useragent is allowed to crawl
-    '''
-    global robots_seen # global dict for files
-    robots_seen[parsed.netloc] = False # default seen
-    try:
-        url = 'http://' + parsed.netloc + '/robots.txt' # filter url and set
-        sitemap = requests.get(url) # sitmap get
-        if sitemap.status_code != 200: # no file so let her rip
-            return True
-        eva = robotparser.RobotFileParser(url)
-        eva.read()
-        if eva.can_fetch('*', url): # if eva can see url add to dict
-            robots_seen[parsed.netloc] = True
-        return robots_seen[parsed.netloc] # the dict
-    except:
-        return False # default
-
 def extract_next_links(url, resp):
     # Implementation required.
     # url: the URL that was used to get the page
@@ -85,12 +53,13 @@ def extract_next_links(url, resp):
             #skipping query with specific actions which mutate the websites and cause a trap
             if "do=" in href_link:
                 continue

-            # idk if this is too expensive will have to test, don't think that it should go into is_vaild??
-            parsed = urlparse(href_link)
-            if not robots_ok(parsed):
+            # don't know if this is too expensive, otherwise idk
+            # takes parsed url and if not ok on robots goes next, else we can write file
+            parsed = urlparse(href_link)
+            if not robots_are_ok(parsed):
                 continue

             tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
             pages.append(href_link)
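With the switch from robots_ok to robots_are_ok, only the first link seen for a host triggers a robots.txt download; later links on the same host are answered from the robots_seen cache. A small sketch of that behaviour, assuming the robotsokay module added by this commit (the links are hypothetical):

# Per-host caching sketch (hypothetical links; assumes robotsokay.py as added above).
from urllib.parse import urlparse
from robotsokay import robots_are_ok, robots_seen

links = [
    "http://www.example.edu/page1.html",   # first link for this host -> robots.txt is fetched
    "http://www.example.edu/page2.html",   # same host -> answered from robots_seen, no new fetch
]
for link in links:
    allowed = robots_are_ok(urlparse(link))
    print(link, "allowed" if allowed else "blocked")
print(robots_seen)                         # e.g. {'www.example.edu': True}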