webcrawler/spacetime-crawler4py-master/utils/__init__.py
iNocturnis e19f68a6a6
Add files via upload
First Upload
2022-04-15 17:55:11 -07:00

36 lines
1.0 KiB
Python

import os
import logging
from hashlib import sha256
from urllib.parse import urlparse
def get_logger(name, filename=None):
    """Return the logger called *name*, wired to a log file and the console.

    The log file is ``Logs/<filename>.log`` (or ``Logs/<name>.log`` when
    *filename* is falsy), relative to the current working directory; the
    ``Logs`` directory is created if it does not exist.

    ``logging.getLogger`` hands back the same logger object for the same
    name, so this function is safe to call repeatedly: a handler is attached
    only if an equivalent one is not already present. Without that check
    every extra call would duplicate each log record and leak a file handle.

    Args:
        name: Logger name, also used as the log file stem by default.
        filename: Optional log file stem overriding *name*.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(name)
    # The logger itself filters at INFO, so DEBUG records never reach the
    # handlers even though the file handler would accept them.
    logger.setLevel(logging.INFO)

    # exist_ok avoids the check-then-create race when several callers
    # (e.g. worker threads) request a logger at the same time.
    os.makedirs("Logs", exist_ok=True)

    log_path = f"Logs/{filename if filename else name}.log"
    # FileHandler stores its target as an absolute path in ``baseFilename``.
    absolute_log_path = os.path.abspath(log_path)

    has_file_handler = any(
        isinstance(handler, logging.FileHandler)
        and handler.baseFilename == absolute_log_path
        for handler in logger.handlers
    )
    # FileHandler subclasses StreamHandler, so exclude it explicitly when
    # looking for an existing console handler.
    has_console_handler = any(
        isinstance(handler, logging.StreamHandler)
        and not isinstance(handler, logging.FileHandler)
        for handler in logger.handlers
    )

    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    if not has_file_handler:
        fh = logging.FileHandler(log_path)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    if not has_console_handler:
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    return logger
def get_urlhash(url):
    """Return the SHA-256 hex digest of *url* with its scheme ignored.

    The URL is split with ``urlparse`` and the digest is taken over
    ``netloc/path/params/query/fragment`` encoded as UTF-8, so two URLs that
    differ only in scheme hash to the same value.
    """
    pieces = urlparse(url)
    # Every component except the scheme, in urlparse order.
    components = (
        pieces.netloc,
        pieces.path,
        pieces.params,
        pieces.query,
        pieces.fragment,
    )
    digest = sha256()
    digest.update("/".join(map(str, components)).encode("utf-8"))
    return digest.hexdigest()
def normalize(url):
    """Return *url* with all trailing ``/`` characters removed.

    A URL that does not end in ``/`` is returned unchanged.
    """
    return url.rstrip("/") if url.endswith("/") else url