Add files via upload
First Upload
spacetime-crawler4py-master/utils/__init__.py (new file, 35 lines)
@@ -0,0 +1,35 @@
import os
import logging
from hashlib import sha256
from urllib.parse import urlparse


def get_logger(name, filename=None):
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    if not os.path.exists("Logs"):
        os.makedirs("Logs")
    fh = logging.FileHandler(f"Logs/{filename if filename else name}.log")
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)
    return logger


def get_urlhash(url):
    parsed = urlparse(url)
    # everything other than scheme.
    return sha256(
        f"{parsed.netloc}/{parsed.path}/{parsed.params}/"
        f"{parsed.query}/{parsed.fragment}".encode("utf-8")).hexdigest()


def normalize(url):
    if url.endswith("/"):
        return url.rstrip("/")
    return url
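For context, a minimal sketch of how these helpers fit together; the logger name and URL below are illustrative, not from the repository:

from utils import get_logger, get_urlhash, normalize

logger = get_logger("CRAWLER")               # writes Logs/CRAWLER.log and echoes to the console
url = normalize("https://www.ics.uci.edu/")  # trailing slash stripped
logger.info(f"urlhash: {get_urlhash(url)}")  # the scheme is deliberately excluded from the hash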
spacetime-crawler4py-master/utils/config.py (new file, 19 lines)
@@ -0,0 +1,19 @@
import re


class Config(object):
    def __init__(self, config):
        self.user_agent = config["IDENTIFICATION"]["USERAGENT"].strip()
        print(self.user_agent)
        assert self.user_agent != "DEFAULT AGENT", "Set useragent in config.ini"
        assert re.match(r"^[a-zA-Z0-9_ ,]+$", self.user_agent), \
            "User agent should not have any special characters outside '_', ',' and 'space'"
        self.threads_count = int(config["LOCAL PROPERTIES"]["THREADCOUNT"])
        self.save_file = config["LOCAL PROPERTIES"]["SAVE"]

        self.host = config["CONNECTION"]["HOST"]
        self.port = int(config["CONNECTION"]["PORT"])

        self.seed_urls = config["CRAWLER"]["SEEDURL"].split(",")
        self.time_delay = float(config["CRAWLER"]["POLITENESS"])

        self.cache_server = None
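Config accepts any mapping with these four sections, so the standard library ConfigParser works directly. A minimal sketch; the user agent, host, port, and other values here are placeholders, not from the repository:

from configparser import ConfigParser
from utils.config import Config

parser = ConfigParser()
parser.read_string("""
[IDENTIFICATION]
USERAGENT = IR crawler 42

[LOCAL PROPERTIES]
THREADCOUNT = 1
SAVE = frontier.shelve

[CONNECTION]
HOST = localhost
PORT = 9000

[CRAWLER]
SEEDURL = https://www.ics.uci.edu
POLITENESS = 0.5
""")
config = Config(parser)  # the asserts fire if USERAGENT is still "DEFAULT AGENT" or contains special characters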
spacetime-crawler4py-master/utils/download.py (new file, 36 lines)
@@ -0,0 +1,36 @@
import requests
import cbor

from utils.response import Response


def download(url, config, logger=None):
    host, port = config.cache_server
    # Check the Content-Length header first so we never download anything above
    # 1 MB: ~2 MB is the average size of a web page with all images and scripts,
    # and roughly 1 MB is the upper limit for the HTML markup alone, about
    # double the average of a page without those assets.
    resp = requests.head(
        f"http://{host}:{port}/",
        params=[("q", url), ("u", config.user_agent)])

    content_length = resp.headers.get("content-length")
    if content_length and content_length.isdigit():
        if int(content_length) > 1000000:
            print(int(content_length))
            return Response({
                "error": "FILE TOO LARGE!",
                "status": 606,
                "url": url})

    resp = requests.get(
        f"http://{host}:{port}/",
        params=[("q", url), ("u", config.user_agent)])
    try:
        if resp and resp.content:
            return Response(cbor.loads(resp.content))
    except (EOFError, ValueError):
        pass
    if logger:
        logger.error(f"Spacetime Response error {resp} with url {url}.")
    return Response({
        "error": f"Spacetime Response error {resp} with url {url}.",
        "status": resp.status_code,
        "url": url})
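A sketch of a call site, assuming config.cache_server has already been filled in by get_cache_server (defined below) and that the cached copy unpickles to a requests-style response with a content attribute; the URL is illustrative:

resp = download("https://www.ics.uci.edu", config, logger=get_logger("WORKER"))
if resp.status == 200 and resp.raw_response is not None:
    print(f"fetched {len(resp.raw_response.content)} bytes")
else:
    print(f"skipped: status={resp.status} error={resp.error}")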
spacetime-crawler4py-master/utils/pcc_models.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from rtypes import pcc_set, dimension, primarykey


@pcc_set
class Register(object):
    crawler_id = primarykey(str)
    load_balancer = dimension(tuple)
    fresh = dimension(bool)
    invalid = dimension(bool)

    def __init__(self, crawler_id, fresh):
        self.crawler_id = crawler_id
        self.load_balancer = tuple()
        self.fresh = fresh
        self.invalid = False
spacetime-crawler4py-master/utils/response.py (new file, 14 lines)
@@ -0,0 +1,14 @@
import pickle


class Response(object):
    def __init__(self, resp_dict):
        self.url = resp_dict["url"]
        self.status = resp_dict["status"]
        self.error = resp_dict["error"] if "error" in resp_dict else None
        try:
            self.raw_response = (
                pickle.loads(resp_dict["response"])
                if "response" in resp_dict else
                None)
        except TypeError:
            self.raw_response = None
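A minimal sketch of the dict shape Response expects, matching the error dicts built in download.py above (the values are illustrative):

from utils.response import Response

resp = Response({"url": "https://www.ics.uci.edu", "status": 606, "error": "FILE TOO LARGE!"})
assert resp.raw_response is None  # no "response" key, so raw_response stays None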
spacetime-crawler4py-master/utils/server_registration.py (new file, 26 lines)
@@ -0,0 +1,26 @@
import os
from spacetime import Node
from utils.pcc_models import Register


def init(df, user_agent, fresh):
    reg = df.read_one(Register, user_agent)
    if not reg:
        reg = Register(user_agent, fresh)
        df.add_one(Register, reg)
        df.commit()
        df.push_await()
    while not reg.load_balancer:
        df.pull_await()
        if reg.invalid:
            raise RuntimeError("User agent string is not acceptable.")
    if reg.load_balancer:
        df.delete_one(Register, reg)
        df.commit()
        df.push()
    return reg.load_balancer


def get_cache_server(config, restart):
    init_node = Node(
        init, Types=[Register], dataframe=(config.host, config.port))
    return init_node.start(
        config.user_agent, restart or not os.path.exists(config.save_file))
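For completeness, a sketch of how registration feeds back into Config so that download() can unpack config.cache_server as a (host, port) tuple; the config filename is illustrative:

from configparser import ConfigParser
from utils.config import Config
from utils.server_registration import get_cache_server

parser = ConfigParser()
parser.read("config.ini")
config = Config(parser)
config.cache_server = get_cache_server(config, restart=False)  # load_balancer tuple from the registry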