From 2efcb22c58d75a039d1a96d44f87f98626430a89 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Sun, 17 Apr 2022 13:00:07 -0700 Subject: [PATCH 01/10] test create branch, place holder for trap fix --- spacetime-crawler4py-master/scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index dead1ea..cd78471 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -95,6 +95,7 @@ def is_valid(url): return False elif parsed.fragment: return False + # will add trap check here most likely else: return True From 0e4187a5fa459fc4b6ecb7bc0570727a0e2b7163 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 02:25:03 -0700 Subject: [PATCH 02/10] added a looping and repeating trap fix --- spacetime-crawler4py-master/.gitignore | 4 ++++ spacetime-crawler4py-master/scraper.py | 22 +++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 spacetime-crawler4py-master/.gitignore diff --git a/spacetime-crawler4py-master/.gitignore b/spacetime-crawler4py-master/.gitignore new file mode 100644 index 0000000..416ebbb --- /dev/null +++ b/spacetime-crawler4py-master/.gitignore @@ -0,0 +1,4 @@ +__pycache__/* +logs/* +utils/* +crawler/__pycache__/* diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index cd78471..72db35b 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -1,3 +1,4 @@ +from operator import truediv import re from urllib.parse import urlparse from urllib.parse import urljoin @@ -58,6 +59,24 @@ def extract_next_links(url, resp): print("Page error !") return pages +# hopefuly fixes some loop traps and repeating (looping) directories +# the amount of repeated subdirectories allowed can be changed +# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website +# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/ +def is_a_loop_trap(url): + word_dict = {} + parsed = urlparse(url) + url_path = str(parsed.path) + word_list = url_path.split('/') + for word in word_list: + if word in word_dict: + word_dict[word] += 1 + if word_dict[word] == 3: + return True + else: + word_dict[word] = 1 + return False + #*.ics.uci.edu/* #*.cs.uci.edu/* #*.informatics.uci.edu/* @@ -95,7 +114,8 @@ def is_valid(url): return False elif parsed.fragment: return False - # will add trap check here most likely + elif is_a_loop_trap(url): + return False else: return True From 577fdb5a809f9635792107b4b7f6c1da3bd571a9 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 11:29:43 -0700 Subject: [PATCH 03/10] added robot.txt check --- spacetime-crawler4py-master/scraper.py | 49 ++++++++++++++++---------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index 72db35b..89ba22c 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -1,7 +1,9 @@ from operator import truediv import re +from urllib import robotparser from urllib.parse import urlparse from urllib.parse import urljoin +from urllib.robotparser import RobotFileParser from bs4 import BeautifulSoup def scraper(url, resp): @@ -18,6 +20,35 @@ def scraper(url, resp): invalid_links.write(link + "\n") return links_valid +# hopefuly fixes some loop traps and repeating (looping) directories 
+# the amount of repeated subdirectories allowed can be changed +# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website +# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/ +def is_a_loop_trap(url): + word_dict = {} + parsed = urlparse(url) + url_path = str(parsed.path) + word_list = url_path.split('/') + for word in word_list: + if word in word_dict: + word_dict[word] += 1 + if word_dict[word] == 3: + return True + else: + word_dict[word] = 1 + return False + +# Tests to see if the url is ok to be crawled by checking against the robots.txt +# file. It does so by checking the URL or URL prefixes +# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser +# http://pymotw.com/2/robotparser/ +def robots_ok(baseurl): + eva = robotparser.RobotFileParser() + rooturl = str(urljoin(baseurl, '/')[:-1]) # get each subdomain by itself + eva.set_url(rooturl + "/robots.txt") # set location of robots.txt + eva.read() # read and fead to parser + return eva.can_fetch('*', baseurl) # returns true if useragent is allowed to crawl + def extract_next_links(url, resp): # Implementation required. # url: the URL that was used to get the page @@ -59,24 +90,6 @@ def extract_next_links(url, resp): print("Page error !") return pages -# hopefuly fixes some loop traps and repeating (looping) directories -# the amount of repeated subdirectories allowed can be changed -# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website -# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/ -def is_a_loop_trap(url): - word_dict = {} - parsed = urlparse(url) - url_path = str(parsed.path) - word_list = url_path.split('/') - for word in word_list: - if word in word_dict: - word_dict[word] += 1 - if word_dict[word] == 3: - return True - else: - word_dict[word] = 1 - return False - #*.ics.uci.edu/* #*.cs.uci.edu/* #*.informatics.uci.edu/* From 1fbcb81faec60b212c97cabffd974100decddfbc Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 11:54:47 -0700 Subject: [PATCH 04/10] forgot to add robot check in is_valid --- spacetime-crawler4py-master/scraper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index 89ba22c..9518209 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -39,7 +39,7 @@ def is_a_loop_trap(url): return False # Tests to see if the url is ok to be crawled by checking against the robots.txt -# file. It does so by checking the URL or URL prefixes +# file. 
It does so by checking the URL or URL prefixes and will return true if page is allowed to be crawled # https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser # http://pymotw.com/2/robotparser/ def robots_ok(baseurl): @@ -129,6 +129,8 @@ def is_valid(url): return False elif is_a_loop_trap(url): return False + elif not robots_ok(url): + return False else: return True From 0e5af0a4c7476c86ea09077c0ea0b0be9ad81621 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 11:59:56 -0700 Subject: [PATCH 05/10] added commented out robot check in next link --- spacetime-crawler4py-master/scraper.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index 9518209..cfa07d9 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -82,7 +82,13 @@ def extract_next_links(url, resp): #skipping query with specific actions which mutate the websites and cause a trap if "do=" in href_link: continue - + ''' + # this is currently in the is_vaild but implimended in a different way, don't know which one would make more sense + # skip as not allowed + if not robots_ok(href_link): + continue + ''' + tempFile.write(href_link + "\n") #Adding to the boi wonder pages pages.append(href_link) @@ -129,6 +135,7 @@ def is_valid(url): return False elif is_a_loop_trap(url): return False + # maybe this should go in the next link? elif not robots_ok(url): return False else: From 4080d46541a16c9f27d05add0957b90d13b39180 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 18:04:11 -0700 Subject: [PATCH 06/10] added my todo for traps so far --- spacetime-crawler4py-master/scraper.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index cfa07d9..fdf6d60 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -138,6 +138,11 @@ def is_valid(url): # maybe this should go in the next link? 
elif not robots_ok(url): return False + # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression + # add lem check + # add another dir check + # add extra dir check + # add cal check else: return True From 4ace2164f2db63f53d369240bc0cfb945bec27ba Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 18:38:16 -0700 Subject: [PATCH 07/10] more todos --- spacetime-crawler4py-master/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index fdf6d60..cba8c3b 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -141,7 +141,7 @@ def is_valid(url): # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression # add lem check # add another dir check - # add extra dir check + # add extra dir check (we can add as we find) # add cal check else: return True From 8f260cb1104a68f2f688c3aff2b7b15a22241168 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Tue, 19 Apr 2022 03:02:14 -0700 Subject: [PATCH 08/10] trap fixes based on internet and what I found --- spacetime-crawler4py-master/scraper.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index cba8c3b..9eb88ba 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -1,3 +1,4 @@ +from distutils.filelist import findall from operator import truediv import re from urllib import robotparser @@ -44,8 +45,8 @@ def is_a_loop_trap(url): # http://pymotw.com/2/robotparser/ def robots_ok(baseurl): eva = robotparser.RobotFileParser() - rooturl = str(urljoin(baseurl, '/')[:-1]) # get each subdomain by itself - eva.set_url(rooturl + "/robots.txt") # set location of robots.txt + rooturl = str(urljoin(baseurl, '/')[:-1]) # get each path by itself + eva.set_url(rooturl + "/robots.txt") # set location of robots.txt eva.read() # read and fead to parser return eva.can_fetch('*', baseurl) # returns true if useragent is allowed to crawl @@ -139,10 +140,21 @@ def is_valid(url): elif not robots_ok(url): return False # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression - # add lem check - # add another dir check - # add extra dir check (we can add as we find) - # add cal check + # length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enought) + # we can adjust it based on what the cralwer does as well + elif len(url) > 150: + return False + # another looping directory check but more advanced than the one contained in is_a_trap + elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()): + return False + # extra directories check (we can add as we find) + elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()): + return False + # calendar checks plus adding or downloading calendar (ical) + elif re.match(r"^.*calendar.*$",parsed.path.lower()): + return False + elif parsed.query.find('ical') != -1: + return False else: return True From 56e74c6b4baca84093061a0b14a961473d7702b5 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Tue, 19 Apr 2022 12:52:23 -0700 Subject: [PATCH 09/10] url len chg and added catch for repeating filter --- 
 spacetime-crawler4py-master/scraper.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index 9eb88ba..0062760 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -142,7 +142,10 @@ def is_valid(url):
     # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
     # length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enought)
     # we can adjust it based on what the cralwer does as well
-    elif len(url) > 150:
+    elif len(url) > 169:
         return False
+    # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
+    elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
+        return False
     # another looping directory check but more advanced than the one contained in is_a_trap
     elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):

From 03772651808842203d8dc8055b06333caa0f1c5c Mon Sep 17 00:00:00 2001
From: Lacerum
Date: Tue, 19 Apr 2022 13:18:15 -0700
Subject: [PATCH 10/10] urls when opened download a file, keep or no, idk

---
 spacetime-crawler4py-master/scraper.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index 0062760..3f39144 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -147,6 +147,9 @@ def is_valid(url):
     # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
     elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
         return False
+    # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
+    # elif re.match(r"^.*\&format=(\D{3,4})\Z$",parsed.path.lower()):
+    # return False
     # another looping directory check but more advanced than the one contained in is_a_trap
     elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
         return False
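
A standalone sanity check for the trap heuristics added above -- a minimal sketch, not part of any commit in this series. is_a_loop_trap() follows patch 02, and the length cap and regexes are taken verbatim from patches 08-09; the looks_like_trap() wrapper, the file name and the sample URLs are illustrative assumptions. robots_ok() is left out because it needs a live network fetch of robots.txt.

# trap_checks.py -- consolidated sketch of the is_valid() trap heuristics above.
import re
from urllib.parse import urlparse


def is_a_loop_trap(url):
    # Same idea as patch 02: flag a path whose segments repeat three times.
    word_dict = {}
    for word in urlparse(url).path.split('/'):
        word_dict[word] = word_dict.get(word, 0) + 1
        if word_dict[word] == 3:
            return True
    return False


def looks_like_trap(url):
    # Illustrative wrapper (not in the patches) combining the patch 08/09 checks.
    parsed = urlparse(url)
    path = parsed.path.lower()
    if is_a_loop_trap(url):
        return True
    if len(url) > 169:  # length cap from patch 09
        return True
    if re.match(r".*(&filter%.*){3,}", path):  # repeated search-box filters
        return True
    # repeating directories, e.g. /a/b/a/b/...
    if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$", path):
        return True
    # three consecutive "extra" directories such as /sites/all/themes
    if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", path):
        return True
    # calendar pages and ical downloads
    if re.match(r"^.*calendar.*$", path) or parsed.query.find('ical') != -1:
        return True
    return False


if __name__ == "__main__":
    samples = [  # hypothetical URLs, only to show each rule firing
        "https://www.ics.uci.edu/a/b/a/b/a/b/index.html",          # repeating directories
        "https://www.ics.uci.edu/community/events/calendar/2022",  # calendar trap
        "https://www.ics.uci.edu/about/visit.html",                # should be crawlable
    ]
    for u in samples:
        print(f"{looks_like_trap(u)!s:5}  {u}")

One further note on the robots.txt check from patches 03-05: robots_ok() builds and read()s a fresh RobotFileParser for every candidate URL, which re-downloads robots.txt each time. A possible refinement, not implemented in these patches, would be to cache one parser per host so each robots.txt is fetched only once.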