From 2efcb22c58d75a039d1a96d44f87f98626430a89 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Sun, 17 Apr 2022 13:00:07 -0700 Subject: [PATCH 01/10] test create branch, place holder for trap fix --- spacetime-crawler4py-master/scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index dead1ea..cd78471 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -95,6 +95,7 @@ def is_valid(url): return False elif parsed.fragment: return False + # will add trap check here most likely else: return True From 0e4187a5fa459fc4b6ecb7bc0570727a0e2b7163 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 02:25:03 -0700 Subject: [PATCH 02/10] added a looping and repeating trap fix --- spacetime-crawler4py-master/.gitignore | 4 ++++ spacetime-crawler4py-master/scraper.py | 22 +++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 spacetime-crawler4py-master/.gitignore diff --git a/spacetime-crawler4py-master/.gitignore b/spacetime-crawler4py-master/.gitignore new file mode 100644 index 0000000..416ebbb --- /dev/null +++ b/spacetime-crawler4py-master/.gitignore @@ -0,0 +1,4 @@ +__pycache__/* +logs/* +utils/* +crawler/__pycache__/* diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index cd78471..72db35b 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -1,3 +1,4 @@ +from operator import truediv import re from urllib.parse import urlparse from urllib.parse import urljoin @@ -58,6 +59,24 @@ def extract_next_links(url, resp): print("Page error !") return pages +# hopefuly fixes some loop traps and repeating (looping) directories +# the amount of repeated subdirectories allowed can be changed +# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website +# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/ +def is_a_loop_trap(url): + word_dict = {} + parsed = urlparse(url) + url_path = str(parsed.path) + word_list = url_path.split('/') + for word in word_list: + if word in word_dict: + word_dict[word] += 1 + if word_dict[word] == 3: + return True + else: + word_dict[word] = 1 + return False + #*.ics.uci.edu/* #*.cs.uci.edu/* #*.informatics.uci.edu/* @@ -95,7 +114,8 @@ def is_valid(url): return False elif parsed.fragment: return False - # will add trap check here most likely + elif is_a_loop_trap(url): + return False else: return True From 577fdb5a809f9635792107b4b7f6c1da3bd571a9 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 11:29:43 -0700 Subject: [PATCH 03/10] added robot.txt check --- spacetime-crawler4py-master/scraper.py | 49 ++++++++++++++++---------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index 72db35b..89ba22c 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -1,7 +1,9 @@ from operator import truediv import re +from urllib import robotparser from urllib.parse import urlparse from urllib.parse import urljoin +from urllib.robotparser import RobotFileParser from bs4 import BeautifulSoup def scraper(url, resp): @@ -18,6 +20,35 @@ def scraper(url, resp): invalid_links.write(link + "\n") return links_valid +# hopefuly fixes some loop traps and repeating (looping) directories 
+# the amount of repeated subdirectories allowed can be changed +# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website +# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/ +def is_a_loop_trap(url): + word_dict = {} + parsed = urlparse(url) + url_path = str(parsed.path) + word_list = url_path.split('/') + for word in word_list: + if word in word_dict: + word_dict[word] += 1 + if word_dict[word] == 3: + return True + else: + word_dict[word] = 1 + return False + +# Tests to see if the url is ok to be crawled by checking against the robots.txt +# file. It does so by checking the URL or URL prefixes +# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser +# http://pymotw.com/2/robotparser/ +def robots_ok(baseurl): + eva = robotparser.RobotFileParser() + rooturl = str(urljoin(baseurl, '/')[:-1]) # get each subdomain by itself + eva.set_url(rooturl + "/robots.txt") # set location of robots.txt + eva.read() # read and fead to parser + return eva.can_fetch('*', baseurl) # returns true if useragent is allowed to crawl + def extract_next_links(url, resp): # Implementation required. # url: the URL that was used to get the page @@ -59,24 +90,6 @@ def extract_next_links(url, resp): print("Page error !") return pages -# hopefuly fixes some loop traps and repeating (looping) directories -# the amount of repeated subdirectories allowed can be changed -# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website -# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/ -def is_a_loop_trap(url): - word_dict = {} - parsed = urlparse(url) - url_path = str(parsed.path) - word_list = url_path.split('/') - for word in word_list: - if word in word_dict: - word_dict[word] += 1 - if word_dict[word] == 3: - return True - else: - word_dict[word] = 1 - return False - #*.ics.uci.edu/* #*.cs.uci.edu/* #*.informatics.uci.edu/* From 1fbcb81faec60b212c97cabffd974100decddfbc Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 11:54:47 -0700 Subject: [PATCH 04/10] forgot to add robot check in is_valid --- spacetime-crawler4py-master/scraper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index 89ba22c..9518209 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -39,7 +39,7 @@ def is_a_loop_trap(url): return False # Tests to see if the url is ok to be crawled by checking against the robots.txt -# file. It does so by checking the URL or URL prefixes +# file. 
It does so by checking the URL or URL prefixes and will return true if page is allowed to be crawled # https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser # http://pymotw.com/2/robotparser/ def robots_ok(baseurl): @@ -129,6 +129,8 @@ def is_valid(url): return False elif is_a_loop_trap(url): return False + elif not robots_ok(url): + return False else: return True From 0e5af0a4c7476c86ea09077c0ea0b0be9ad81621 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 11:59:56 -0700 Subject: [PATCH 05/10] added commented out robot check in next link --- spacetime-crawler4py-master/scraper.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index 9518209..cfa07d9 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -82,7 +82,13 @@ def extract_next_links(url, resp): #skipping query with specific actions which mutate the websites and cause a trap if "do=" in href_link: continue - + ''' + # this is currently in the is_vaild but implimended in a different way, don't know which one would make more sense + # skip as not allowed + if not robots_ok(href_link): + continue + ''' + tempFile.write(href_link + "\n") #Adding to the boi wonder pages pages.append(href_link) @@ -129,6 +135,7 @@ def is_valid(url): return False elif is_a_loop_trap(url): return False + # maybe this should go in the next link? elif not robots_ok(url): return False else: From 4080d46541a16c9f27d05add0957b90d13b39180 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 18:04:11 -0700 Subject: [PATCH 06/10] added my todo for traps so far --- spacetime-crawler4py-master/scraper.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index cfa07d9..fdf6d60 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -138,6 +138,11 @@ def is_valid(url): # maybe this should go in the next link? 
elif not robots_ok(url): return False + # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression + # add lem check + # add another dir check + # add extra dir check + # add cal check else: return True From 4ace2164f2db63f53d369240bc0cfb945bec27ba Mon Sep 17 00:00:00 2001 From: Lacerum Date: Mon, 18 Apr 2022 18:38:16 -0700 Subject: [PATCH 07/10] more todos --- spacetime-crawler4py-master/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index fdf6d60..cba8c3b 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -141,7 +141,7 @@ def is_valid(url): # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression # add lem check # add another dir check - # add extra dir check + # add extra dir check (we can add as we find) # add cal check else: return True From 8f260cb1104a68f2f688c3aff2b7b15a22241168 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Tue, 19 Apr 2022 03:02:14 -0700 Subject: [PATCH 08/10] trap fixes based on internet and what I found --- spacetime-crawler4py-master/scraper.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index cba8c3b..9eb88ba 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -1,3 +1,4 @@ +from distutils.filelist import findall from operator import truediv import re from urllib import robotparser @@ -44,8 +45,8 @@ def is_a_loop_trap(url): # http://pymotw.com/2/robotparser/ def robots_ok(baseurl): eva = robotparser.RobotFileParser() - rooturl = str(urljoin(baseurl, '/')[:-1]) # get each subdomain by itself - eva.set_url(rooturl + "/robots.txt") # set location of robots.txt + rooturl = str(urljoin(baseurl, '/')[:-1]) # get each path by itself + eva.set_url(rooturl + "/robots.txt") # set location of robots.txt eva.read() # read and fead to parser return eva.can_fetch('*', baseurl) # returns true if useragent is allowed to crawl @@ -139,10 +140,21 @@ def is_valid(url): elif not robots_ok(url): return False # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression - # add lem check - # add another dir check - # add extra dir check (we can add as we find) - # add cal check + # length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enought) + # we can adjust it based on what the cralwer does as well + elif len(url) > 150: + return False + # another looping directory check but more advanced than the one contained in is_a_trap + elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()): + return False + # extra directories check (we can add as we find) + elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()): + return False + # calendar checks plus adding or downloading calendar (ical) + elif re.match(r"^.*calendar.*$",parsed.path.lower()): + return False + elif parsed.query.find('ical') != -1: + return False else: return True From 56e74c6b4baca84093061a0b14a961473d7702b5 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Tue, 19 Apr 2022 12:52:23 -0700 Subject: [PATCH 09/10] url len chg and added catch for repeating filter --- 
 spacetime-crawler4py-master/scraper.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index 9eb88ba..0062760 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -142,7 +142,10 @@ def is_valid(url):
     # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
     # length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enought)
     # we can adjust it based on what the cralwer does as well
-    elif len(url) > 150:
+    elif len(url) > 169:
         return False
+    # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
+    elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
+        return False
     # another looping directory check but more advanced than the one contained in is_a_trap
     elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):

From 03772651808842203d8dc8055b06333caa0f1c5c Mon Sep 17 00:00:00 2001
From: Lacerum
Date: Tue, 19 Apr 2022 13:18:15 -0700
Subject: [PATCH 10/10] urls when opened download a file, keep or no, idk

---
 spacetime-crawler4py-master/scraper.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index 0062760..3f39144 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -147,6 +147,9 @@ def is_valid(url):
     # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
     elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
         return False
+    # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
+    # elif re.match(r"^.*\&format=(\D{3,4})\Z$",parsed.path.lower()):
+    # return False
     # another looping directory check but more advanced than the one contained in is_a_trap
     elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
         return False
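
A standalone sanity check for the trap heuristics added above -- a minimal sketch, not part of any commit in this series. is_a_loop_trap() follows patch 02, and the length cap and regexes are taken verbatim from patches 08-09; the looks_like_trap() wrapper, the file name and the sample URLs are illustrative assumptions. robots_ok() is left out because it needs a live network fetch of robots.txt.

# trap_checks.py -- consolidated sketch of the is_valid() trap heuristics above.
import re
from urllib.parse import urlparse


def is_a_loop_trap(url):
    # Same idea as patch 02: flag a path whose segments repeat three times.
    word_dict = {}
    for word in urlparse(url).path.split('/'):
        word_dict[word] = word_dict.get(word, 0) + 1
        if word_dict[word] == 3:
            return True
    return False


def looks_like_trap(url):
    # Illustrative wrapper (not in the patches) combining the patch 08/09 checks.
    parsed = urlparse(url)
    path = parsed.path.lower()
    if is_a_loop_trap(url):
        return True
    if len(url) > 169:  # length cap from patch 09
        return True
    if re.match(r".*(&filter%.*){3,}", path):  # repeated search-box filters
        return True
    # repeating directories, e.g. /a/b/a/b/...
    if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$", path):
        return True
    # three consecutive "extra" directories such as /sites/all/themes
    if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", path):
        return True
    # calendar pages and ical downloads
    if re.match(r"^.*calendar.*$", path) or parsed.query.find('ical') != -1:
        return True
    return False


if __name__ == "__main__":
    samples = [  # hypothetical URLs, only to show each rule firing
        "https://www.ics.uci.edu/a/b/a/b/a/b/index.html",          # repeating directories
        "https://www.ics.uci.edu/community/events/calendar/2022",  # calendar trap
        "https://www.ics.uci.edu/about/visit.html",                # should be crawlable
    ]
    for u in samples:
        print(f"{looks_like_trap(u)!s:5}  {u}")

One further note on the robots.txt check from patches 03-05: robots_ok() builds and read()s a fresh RobotFileParser for every candidate URL, which re-downloads robots.txt each time. A possible refinement, not implemented in these patches, would be to cache one parser per host so each robots.txt is fetched only once.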