From 03772651808842203d8dc8055b06333caa0f1c5c Mon Sep 17 00:00:00 2001
From: Lacerum <smithaaronleo@gmail.com>
Date: Tue, 19 Apr 2022 13:18:15 -0700
Subject: [PATCH] Add commented-out filter for URLs that trigger a file download (undecided whether to keep)

---
 spacetime-crawler4py-master/scraper.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index 0062760..3f39144 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -147,6 +147,9 @@ def is_valid(url):
         # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters 
         elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
             return False
+        # this is for URLs that, when opened, download a file (do we want to download these files and tokenize them?)
+        # elif re.match(r"^.*\&format=(\D{3,4})\Z$",parsed.path.lower()):
+        #     return False
         # another looping directory check but more advanced than the one contained in is_a_trap
         elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
             return False