From 03772651808842203d8dc8055b06333caa0f1c5c Mon Sep 17 00:00:00 2001 From: Lacerum Date: Tue, 19 Apr 2022 13:18:15 -0700 Subject: [PATCH] urls when opened download a file, keep or no, idk --- spacetime-crawler4py-master/scraper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py index 0062760..3f39144 100644 --- a/spacetime-crawler4py-master/scraper.py +++ b/spacetime-crawler4py-master/scraper.py @@ -147,6 +147,9 @@ def is_valid(url): # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()): return False + # this is for URLs that download a file when opened (open question: do we want to download and tokenize these files?) + # elif re.match(r"^.*\&format=(\D{3,4})\Z$",parsed.path.lower()): + # return False # another looping directory check but more advanced than the one contained in is_a_trap elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()): return False