Add files via upload

First Upload
commit e19f68a6a6 by iNocturnis, committed via GitHub on 2022-04-15 17:55:11 -07:00
16 changed files with 689 additions and 0 deletions

View File

@@ -0,0 +1,35 @@
import os
import logging
from hashlib import sha256
from urllib.parse import urlparse


def get_logger(name, filename=None):
    # Log everything to Logs/<filename or name>.log and echo INFO+ to the console.
    logger = logging.getLogger(name)
    # The logger itself must pass DEBUG records through, or the file
    # handler's DEBUG level below would never receive them.
    logger.setLevel(logging.DEBUG)
    if not os.path.exists("Logs"):
        os.makedirs("Logs")
    fh = logging.FileHandler(f"Logs/{filename if filename else name}.log")
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)
    return logger


def get_urlhash(url):
    parsed = urlparse(url)
    # Hash everything other than the scheme, so the http and https
    # variants of a URL map to the same key.
    return sha256(
        f"{parsed.netloc}/{parsed.path}/{parsed.params}/"
        f"{parsed.query}/{parsed.fragment}".encode("utf-8")).hexdigest()


def normalize(url):
    # Treat URLs that differ only by trailing slashes as identical.
    if url.endswith("/"):
        return url.rstrip("/")
    return url
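
A quick sketch of how these helpers might be exercised; the logger name and URL below are illustrative, not part of the upload.

logger = get_logger("CRAWLER")
logger.info("Crawler starting.")  # echoed to the console, appended to Logs/CRAWLER.log

url = "https://www.example.com/about/"
print(normalize(url))    # https://www.example.com/about
print(get_urlhash(url))  # sha256 hex digest; http/https variants hash the same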

View File

@@ -0,0 +1,19 @@
import re


class Config(object):
    def __init__(self, config):
        self.user_agent = config["IDENTIFICATION"]["USERAGENT"].strip()
        print(self.user_agent)
        assert self.user_agent != "DEFAULT AGENT", "Set useragent in config.ini"
        assert re.match(r"^[a-zA-Z0-9_ ,]+$", self.user_agent), \
            "User agent should not have any special characters " \
            "outside '_', ',' and 'space'"
        self.threads_count = int(config["LOCAL PROPERTIES"]["THREADCOUNT"])
        self.save_file = config["LOCAL PROPERTIES"]["SAVE"]
        self.host = config["CONNECTION"]["HOST"]
        self.port = int(config["CONNECTION"]["PORT"])
        self.seed_urls = config["CRAWLER"]["SEEDURL"].split(",")
        self.time_delay = float(config["CRAWLER"]["POLITENESS"])
        # Filled in later, once get_cache_server() returns a (host, port) pair.
        self.cache_server = None
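
A sketch of how this class is presumably fed from config.ini through the stdlib configparser; the section and key names mirror the lookups above, while the sample values and the module path are placeholders.

from configparser import ConfigParser

from utils.config import Config  # assumed module path, matching the utils imports elsewhere

sample = """
[IDENTIFICATION]
USERAGENT = IR Crawler 12345678,87654321

[LOCAL PROPERTIES]
THREADCOUNT = 1
SAVE = frontier.shelve

[CONNECTION]
HOST = localhost
PORT = 9000

[CRAWLER]
SEEDURL = https://www.ics.uci.edu
POLITENESS = 0.5
"""

parser = ConfigParser()
parser.read_string(sample)
config = Config(parser)  # passes both asserts: non-default agent, no special characters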

View File

@@ -0,0 +1,36 @@
import requests
import cbor
import time
from utils.response import Response


def download(url, config, logger=None):
    host, port = config.cache_server
    # Check the headers first so we never download anything above 1MB.
    # A full webpage with all images and scripts averages roughly 2MB;
    # the HTML markup alone tops out around 1MB, so anything larger is
    # rejected before the body is fetched.
    resp = requests.head(
        f"http://{host}:{port}/",
        params=[("q", f"{url}"), ("u", f"{config.user_agent}")])
    content_length = resp.headers.get("content-length")
    if content_length and content_length.isdigit():
        if int(content_length) > 1000000:
            print(int(content_length))
            return Response({
                "error": "FILE TOO LARGE!",
                "status": 606,
                "url": url})
    resp = requests.get(
        f"http://{host}:{port}/",
        params=[("q", f"{url}"), ("u", f"{config.user_agent}")])
    try:
        if resp and resp.content:
            return Response(cbor.loads(resp.content))
    except (EOFError, ValueError):
        pass
    if logger:
        logger.error(f"Spacetime Response error {resp} with url {url}.")
    return Response({
        "error": f"Spacetime Response error {resp} with url {url}.",
        "status": resp.status_code,
        "url": url})

View File

@@ -0,0 +1,15 @@
from rtypes import pcc_set, dimension, primarykey


@pcc_set
class Register(object):
    # Spacetime record a crawler publishes to register itself; the server
    # side answers by populating load_balancer (or flagging invalid).
    crawler_id = primarykey(str)
    load_balancer = dimension(tuple)
    fresh = dimension(bool)
    invalid = dimension(bool)

    def __init__(self, crawler_id, fresh):
        self.crawler_id = crawler_id
        self.load_balancer = tuple()
        self.fresh = fresh
        self.invalid = False

View File

@@ -0,0 +1,14 @@
import pickle


class Response(object):
    def __init__(self, resp_dict):
        self.url = resp_dict["url"]
        self.status = resp_dict["status"]
        self.error = resp_dict["error"] if "error" in resp_dict else None
        try:
            # "response", when present, carries a pickled copy of the
            # fetched page object; fall back to None on bad payloads.
            self.raw_response = (
                pickle.loads(resp_dict["response"])
                if "response" in resp_dict else
                None)
        except TypeError:
            self.raw_response = None
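
For reference, a sketch of the two dict shapes Response accepts, mirroring what download() builds above; the pickled payload here is a stand-in for the real fetched-page object.

err = Response({"url": "http://example.com/", "status": 606,
                "error": "FILE TOO LARGE!"})           # error shape: raw_response stays None

ok = Response({"url": "http://example.com/", "status": 200,
               "response": pickle.dumps({"k": "v"})})  # success shape: raw_response unpickled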

View File

@@ -0,0 +1,26 @@
import os
from spacetime import Node
from utils.pcc_models import Register


def init(df, user_agent, fresh):
    # Publish a Register record for this user agent, then wait for the
    # server to hand back the cache server's address via load_balancer.
    reg = df.read_one(Register, user_agent)
    if not reg:
        reg = Register(user_agent, fresh)
        df.add_one(Register, reg)
        df.commit()
        df.push_await()
    while not reg.load_balancer:
        df.pull_await()
        if reg.invalid:
            raise RuntimeError("User agent string is not acceptable.")
    if reg.load_balancer:
        df.delete_one(Register, reg)
        df.commit()
        df.push()
    return reg.load_balancer


def get_cache_server(config, restart):
    init_node = Node(
        init, Types=[Register], dataframe=(config.host, config.port))
    return init_node.start(
        config.user_agent, restart or not os.path.exists(config.save_file))
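
Putting the files together, a plausible startup sequence; the config.ini filename, the restart flag, and the module paths are assumptions, not taken from this commit.

from configparser import ConfigParser

from utils.config import Config  # assumed module path
from utils.server_registration import get_cache_server  # assumed module path

parser = ConfigParser()
parser.read("config.ini")  # assumed filename
config = Config(parser)

# Blocks until the spacetime server returns a (host, port) pair;
# download() then routes every request through that cache server.
config.cache_server = get_cache_server(config, restart=False)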