added robots.txt check

2022-04-18 11:29:43 -07:00
parent 0e4187a5fa
commit 577fdb5a80
1 changed files with 31 additions and 18 deletions
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -1,7 +1,9 @@
 from operator import truediv
 import re
+from urllib import robotparser
 from urllib.parse import urlparse
 from urllib.parse import urljoin
+from urllib.robotparser import RobotFileParser
 from bs4 import BeautifulSoup

 def scraper(url, resp):
@@ -18,6 +20,35 @@ def scraper(url, resp):
            invalid_links.write(link + "\n")
    return links_valid

+# hopefully fixes some loop traps and repeating (looping) directories
+# the number of repeats allowed can be changed via the `== 3` check below
+# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
+# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
+def is_a_loop_trap(url):  # returns True once any path segment occurs a 3rd time, else False
+    word_dict = {}  # path segment -> number of occurrences seen so far
+    parsed = urlparse(url)
+    url_path = str(parsed.path)  # only the path is inspected; query string and fragment are not
+    word_list = url_path.split('/')  # NOTE(review): '' is counted too (leading '/', trailing '/', '//')
+    for word in word_list:
+        if word in word_dict:
+            word_dict[word] += 1
+            if word_dict[word] == 3:  # third occurrence of the same segment => treat as a trap
+                return True
+        else:
+            word_dict[word] = 1
+    return False
+
+# Tests whether the url may be crawled by checking it against its host's robots.txt.
+# NOTE(review): robots.txt is fetched on every call; exceptions from read() are not caught here.
+# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
+# http://pymotw.com/2/robotparser/
+def robots_ok(baseurl):
+    eva = robotparser.RobotFileParser()
+    rooturl = str(urljoin(baseurl, '/')[:-1])   # site root of baseurl with the trailing '/' removed
+    eva.set_url(rooturl + "/robots.txt")         # set location of robots.txt
+    eva.read()                                  # read robots.txt and feed it to the parser
+    return eva.can_fetch('*', baseurl)          # True if the wildcard user agent '*' may fetch baseurl
+
 def extract_next_links(url, resp):
    # Implementation required.
    # url: the URL that was used to get the page
@@ -59,24 +90,6 @@ def extract_next_links(url, resp):
        print("Page error !")
    return pages

-# hopefuly fixes some loop traps and repeating (looping) directories
-# the amount of repeated subdirectories allowed can be changed
-# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
-# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
-def is_a_loop_trap(url):
-    word_dict = {}
-    parsed = urlparse(url)
-    url_path = str(parsed.path)
-    word_list = url_path.split('/')
-    for word in word_list:
-        if word in word_dict:
-            word_dict[word] += 1
-            if word_dict[word] == 3:
-                return True
-        else:
-            word_dict[word] = 1
-    return False
-
 #*.ics.uci.edu/*
 #*.cs.uci.edu/*
 #*.informatics.uci.edu/*