Merge remote-tracking branch 'origin/traps'

2022-04-20 02:17:27 -07:00
parent e31ad13d40 0377265180
commit 367a324ead
2 changed files with 72 additions and 1 deletions
--- a/spacetime-crawler4py-master/.gitignore
+++ b/spacetime-crawler4py-master/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/*
+logs/*
+utils/*
+crawler/__pycache__/*
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -1,7 +1,12 @@
+from distutils.filelist import findall
+from operator import truediv
 import re
+
 import urllib.request
+from urllib import robotparser
 from urllib.parse import urlparse
 from urllib.parse import urljoin
+from urllib.robotparser import RobotFileParser
 from bs4 import BeautifulSoup
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
@@ -63,6 +68,35 @@ def scraper(url, resp):

    return links_valid

+# hopefuly fixes some loop traps and repeating (looping) directories
+# the amount of repeated subdirectories allowed can be changed
+# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
+# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
+def is_a_loop_trap(url):
+    word_dict = {}
+    parsed = urlparse(url)
+    url_path = str(parsed.path)
+    word_list = url_path.split('/')
+    for word in word_list:
+        if word in word_dict:
+            word_dict[word] += 1
+            if word_dict[word] == 3:
+                return True
+        else:
+            word_dict[word] = 1
+    return False
+
+# Tests to see if the url is ok to be crawled by checking against the robots.txt
+# file. It does so by checking the URL or URL prefixes and will return true if page is allowed to be crawled
+# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
+# http://pymotw.com/2/robotparser/
+def robots_ok(baseurl):
+    eva = robotparser.RobotFileParser()
+    rooturl = str(urljoin(baseurl, '/')[:-1])   # get each path by itself
+    eva.set_url(rooturl + "/robots.txt")        # set location of robots.txt 
+    eva.read()                                  # read and fead to parser
+    return eva.can_fetch('*', baseurl)          # returns true if useragent is allowed to crawl
+
 def extract_next_links(url, resp):
    # Implementation required.
    # url: the URL that was used to get the page
@@ -96,6 +130,12 @@ def extract_next_links(url, resp):
            #skipping query with specific actions which mutate the websites and cause a trap
            if "do=" in href_link:
                continue
+            '''
+            # this is currently in the is_vaild but implimended in a different way, don't know which one would make more sense
+            # skip as not allowed
+            if not robots_ok(href_link):
+                continue
+            '''
            
            tempFile.write(href_link + "\n")
            #Adding to the boi wonder pages
@@ -141,6 +181,33 @@ def is_valid(url):
            return False
        elif parsed.fragment:
            return False
+        elif is_a_loop_trap(url):
+            return False
+        # maybe this should go in the next link?
+        elif not robots_ok(url):
+            return False
+        # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
+        # length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enought)
+        # we can adjust it based on what the cralwer does as well
+        elif len(url) > 169:
+            return False
+        # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters 
+        elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
+            return False
+        # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
+        # elif re.match(r"^.*\&format=(\D{3,4})\Z$",parsed.path.lower()):
+        #     return False
+        # another looping directory check but more advanced than the one contained in is_a_trap
+        elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
+            return False
+        # extra directories check (we can add as we find)
+        elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()):
+            return False
+        # calendar checks plus adding or downloading calendar (ical)
+        elif re.match(r"^.*calendar.*$",parsed.path.lower()):
+            return False
+        elif parsed.query.find('ical') != -1:
+            return False 
        else:
            return True