Merge remote-tracking branch 'origin/traps'

This commit is contained in:
iNocturnis 2022-04-20 17:49:34 -07:00
commit b495292b87
2 changed files with 50 additions and 63 deletions

View File

@@ -0,0 +1,35 @@
import re
from urllib import robotparser
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
# Tests whether the url is ok to be crawled by checking it against the site's robots.txt
# file. Returns True if the page is allowed to be crawled, True if there is no robots.txt file, and False otherwise.
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
# http://pymotw.com/2/robotparser/
# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
robots_seen = dict() # all robots results go here, keyed by netloc (global so we can store them over the whole site)

def robots_ok(parsed)->bool:
    global robots_seen # global dict of cached results
    robots_seen[parsed.netloc] = False # default to not allowed until proven otherwise
    try:
        url = 'http://' + parsed.netloc + '/robots.txt' # location of the site's robots.txt
        response = requests.get(url) # fetch robots.txt
        if response.status_code != 200: # no robots file, so let her rip
            return True
        eva = robotparser.RobotFileParser(url)
        eva.read()
        if eva.can_fetch('*', parsed.geturl()): # check the actual page url (not the robots.txt url) and add to dict
            robots_seen[parsed.netloc] = True
        return robots_seen[parsed.netloc] # cached result for this site
    except Exception:
        return False # default to not allowed on any error

# Check if the site is already in the dict; if not, run the check and cache it.
def robots_are_ok(parsed):
    global robots_seen
    if parsed.netloc not in robots_seen: # not in the dict yet, so check the site now
        return robots_ok(parsed)
    else:
        return robots_seen[parsed.netloc] # already checked, return the cached value
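# A minimal usage sketch (illustrative only, with a made-up page_url): robots_are_ok
# expects the result of urllib.parse.urlparse and caches one verdict per netloc,
# so a caller in the crawler might look roughly like this:
#
#   from urllib.parse import urlparse
#   page_url = "http://www.example.com/some/page.html"
#   parsed = urlparse(page_url)
#   if robots_are_ok(parsed):   # first call fetches robots.txt, later calls hit the cache
#       pass                    # ok to crawl this site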

View File

@@ -1,25 +1,10 @@
from distutils.filelist import findall
from operator import truediv
import re
import urllib.request
from urllib import robotparser
from urllib.parse import urlparse
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
import html2text
import nltk
# moved all my code to a separate py file and imported it here
from datacollection import *
# nltk.download('stopwords')
# nltk.download('words')
# there is another nltk.download() requirement, but I removed it, so I forgot what it was;
# it'll show up in the console/terminal if you run the code, I believe. It appeared in mine.
from robotsokay import *
def scraper(url, resp):
links = extract_next_links(url, resp)
@@ -68,35 +53,6 @@ def scraper(url, resp):
return links_valid
# hopefully fixes some loop traps and repeating (looping) directories
# the amount of repeated subdirectories allowed can be changed
# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
def is_a_loop_trap(url):
    word_dict = {}
    parsed = urlparse(url)
    url_path = str(parsed.path)
    word_list = url_path.split('/')
    for word in word_list:
        if word in word_dict:
            word_dict[word] += 1
            if word_dict[word] == 3: # a path segment repeated three times is treated as a trap
                return True
        else:
            word_dict[word] = 1
    return False
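# Illustrative sketch of what the check above catches (made-up URLs): any path
# segment that shows up three times marks the URL as a trap.
#
#   is_a_loop_trap("http://example.com/a/b/a/b/a/c.html")  # True: 'a' appears three times
#   is_a_loop_trap("http://example.com/a/b/c/d.html")      # False: nothing repeats enough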
# Tests whether the url is ok to be crawled by checking it against the robots.txt
# file. It does so by checking the URL or URL prefixes and returns True if the page is allowed to be crawled.
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
# http://pymotw.com/2/robotparser/
def robots_ok(baseurl):
    eva = robotparser.RobotFileParser()
    rooturl = str(urljoin(baseurl, '/')[:-1]) # root of the site, without the trailing slash
    eva.set_url(rooturl + "/robots.txt") # set location of robots.txt
    eva.read() # read and feed to parser
    return eva.can_fetch('*', baseurl) # returns True if the useragent is allowed to crawl
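# Rough sketch of the root-url trick above (example.com is a placeholder):
# urljoin(baseurl, '/') collapses the path, so for
#   baseurl = "http://example.com/some/deep/page.html"
# rooturl becomes "http://example.com", and the parser reads
# "http://example.com/robots.txt" before answering can_fetch for baseurl.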
def extract_next_links(url, resp):
# Implementation required.
# url: the URL that was used to get the page
@@ -130,12 +86,12 @@ def extract_next_links(url, resp):
# skipping queries with specific actions which mutate the website and cause a trap
if "do=" in href_link:
continue
'''
# this is currently in is_valid but implemented in a different way, don't know which one would make more sense
# skip as not allowed
if not robots_ok(href_link):
# don't know if this is too expensive, otherwise idk
# takes the parsed url; if it's not ok per robots we skip to the next link, else we can write it to the file
parsed = urlparse(href_link)
if not robots_are_ok(parsed):
continue
'''
tempFile.write(href_link + "\n")
#Adding to the boi wonder pages
@@ -160,6 +116,7 @@ def is_valid(url):
try:
#Gotta check if they are in the domain
parsed = urlparse(url)
url_parsed_path = parsed.path.lower() # this may help speed things up a little bit (less calls to parsed.path)
if parsed.scheme not in set(["http", "https"]):
return False
elif re.match(
@@ -181,32 +138,27 @@ def is_valid(url):
return False
elif parsed.fragment:
return False
elif is_a_loop_trap(url):
return False
# maybe this should go in the next link?
elif not robots_ok(url):
return False
# https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
# length check for looping filters and queries (could add a hash check for similarity or a regex, but don't know if we want to since this works well enough)
# we can adjust it based on what the crawler does as well
elif len(url) > 169:
if len(url) > 169:
return False
# this fixes any search box that keeps going page to page, currently allows a depth of 2 filters
elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
if re.match(r".*(&filter%.*){3,}",url_parsed_path):
return False
# this is for urls which, when opened, download a file (do we want to download these files and tokenize them?)
# elif re.match(r"^.*\&format=(\D{3,4})\Z$",parsed.path.lower()):
# elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
# return False
# another looping directory check, but more advanced than the one contained in is_a_loop_trap
elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
return False
# extra directories check (we can add as we find)
elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()):
if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
return False
# calendar checks plus adding or downloading calendar (ical)
elif re.match(r"^.*calendar.*$",parsed.path.lower()):
if re.match(r"^.*calendar.*$",url_parsed_path):
return False
elif parsed.query.find('ical') != -1:
if parsed.query.find('ical') != -1:
return False
else:
return True
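# A rough sketch of what the filters above catch, with made-up URLs (assuming they
# already pass the domain and extension checks earlier in is_valid, which this hunk
# does not show):
#
#   is_valid("https://example.edu/a/b/a/b/page.html")        # False: repeated /a/b/ directory
#   is_valid("https://example.edu/events/calendar/2022-04")  # False: calendar path
#   is_valid("https://example.edu/page?ical=download")       # False: 'ical' in the query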