46 lines
1.4 KiB
Python
46 lines
1.4 KiB
Python
import re
|
|
from urllib.parse import urlparse
|
|
|
|
# Paths ending in one of these extensions are never HTML pages worth crawling.
# The pattern is applied to the lower-cased URL path.
_NON_PAGE_PATH = re.compile(
    r".*\.(css|js|bmp|gif|jpe?g|ico"
    r"|png|tiff?|mid|mp2|mp3|mp4"
    r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
    r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names"
    r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
    r"|epub|dll|cnf|tgz|sha1"
    r"|thmx|mso|arff|rtf|jar|csv"
    r"|rm|smil|wmv|swf|wma|zip|rar|gz)$"
)

# Domains whose hosts (the domain itself or any sub-domain) may be crawled.
_ALLOWED_DOMAINS = (
    "ics.uci.edu",
    "cs.uci.edu",
    "informatics.uci.edu",
    "stat.uci.edu",
)
_ALLOWED_SUBDOMAIN_SUFFIXES = tuple("." + domain for domain in _ALLOWED_DOMAINS)

# today.uci.edu is only in scope underneath this one department path.
_TODAY_HOST = "today.uci.edu"
_TODAY_PATH = "/department/information_computer_sciences"


def _in_allowed_domain(parsed):
    """Return True if the parsed URL's host (and path, for today.uci.edu) is in scope."""
    host = parsed.hostname  # already lower-cased, port and credentials removed
    if not host:
        return False
    # Compare on label boundaries so that e.g. "physics.uci.edu" is not
    # mistaken for a sub-domain of "ics.uci.edu" or "cs.uci.edu".
    if host in _ALLOWED_DOMAINS or host.endswith(_ALLOWED_SUBDOMAIN_SUFFIXES):
        return True
    if host == _TODAY_HOST:
        return (parsed.path == _TODAY_PATH
                or parsed.path.startswith(_TODAY_PATH + "/"))
    return False


def is_valid(url):
    """Decide whether to crawl *url*.

    A URL is accepted only when all of the following hold:
      * its scheme is http or https;
      * its path does not end in a known non-HTML file extension;
      * its host is one of the allowed UCI domains (or a sub-domain of one),
        or it is today.uci.edu under the ICS department path;
      * it carries no fragment.

    Args:
        url: The URL string to examine.

    Returns:
        True if the URL should be crawled, False otherwise. URLs that
        urlparse rejects as malformed are reported as False.

    Raises:
        TypeError: re-raised after logging if the URL cannot be processed.
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in {"http", "https"}:
            return False
        if _NON_PAGE_PATH.match(parsed.path.lower()):
            return False
        # Check the parsed host rather than the raw URL text, so a URL that
        # merely mentions an allowed domain in its path or query is rejected.
        if not _in_allowed_domain(parsed):
            return False
        if parsed.fragment:
            return False
        return True
    except ValueError:
        # urlparse raises ValueError for malformed URLs (e.g. an unbalanced
        # IPv6 bracket); such a URL cannot be crawled.
        return False
    except TypeError:
        # Report the raw input: the parse result may not exist if urlparse
        # itself was the call that failed.
        print("TypeError for ", url)
        raise
|
|
|
|
# Ad-hoc check: print the crawl verdict for every URL listed in temp.txt
# (one URL per line).
with open("temp.txt") as url_file:
    for line in url_file:
        # Strip the trailing newline so it is not treated as part of the URL.
        print(is_valid(line.strip()))
|
|
|