Add files via upload

First Upload
iNocturnis 2022-04-15 17:55:11 -07:00 committed by GitHub
commit e19f68a6a6
16 changed files with 689 additions and 0 deletions

View File

@@ -0,0 +1,220 @@
ABOUT
-------------------------
This is the base implementation of a full crawler that downloads pages
through a spacetime cache server.
CONFIGURATION
-------------------------
### Step 1: Install dependencies
If you do not have Python 3.6+:
Windows: https://www.python.org/downloads/windows/
Linux: https://docs.python-guide.org/starting/install3/linux/
macOS: https://docs.python-guide.org/starting/install3/osx/
Check whether pip is installed by opening a terminal/command prompt and typing
the command `python3 -m pip`. This should show the help menu listing all the
commands pip supports. If it does not, get pip by following the
instructions at https://pip.pypa.io/en/stable/installing/
To install the dependencies for this project run the following two commands
after ensuring pip is installed for the version of python you are using.
Admin privileges might be required to execute the commands. Also make sure
that the terminal is at the root folder of this project.
```
python -m pip install packages/spacetime-2.1.1-py3-none-any.whl
python -m pip install -r packages/requirements.txt
```
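Optionally, you can sanity-check that the packages were installed for the interpreter
you intend to run. This assumes the two commands above completed without errors; no
output means the imports succeeded.
```
python -c "import spacetime, requests, cbor"
```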
### Step 2: Configuring config.ini
Set the options in the config.ini file. The following
configurations exist.
**USERAGENT**: Set the useragent to `IR F19 uci-id1,uci-id2,uci-id3`.
It is important to set the useragent appropriately to get the credit for
hitting our cache.
**HOST**: This is the host name of our caching server. Please set it as per spec.
**PORT**: This is the port number of our caching server. Please set it as per spec.
**SEEDURL**: The starting url(s) that the crawler first downloads (comma separated if there is more than one).
**POLITENESS**: The time delay (in seconds) that each thread must wait after each download.
**SAVE**: The file that is used to save crawler progress. If you want to restart the
crawler from the seed url, you can simply delete this file.
**THREADCOUNT**: The number of concurrent threads the crawler uses. Do not
change it unless you have implemented multithreading in the crawler. The
crawler, as provided, is deliberately not thread safe.
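For reference, a config.ini following these options might look like the sketch below
(host, port, and ids are placeholders; use the values required by the spec):
```
[IDENTIFICATION]
# Use the exact useragent format described above.
USERAGENT = IR F19 uciid1,uciid2,uciid3

[CONNECTION]
# Host and port of the caching server, as given in the spec.
HOST = <cache host>
PORT = <cache port>

[CRAWLER]
SEEDURL = https://www.ics.uci.edu
# In seconds
POLITENESS = 0.5

[LOCAL PROPERTIES]
# Save file for crawler progress.
SAVE = frontier.shelve
# Do not change unless you have implemented multithreading.
THREADCOUNT = 1
```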
### Step 3: Define your scraper rules.
Develop the definition of the function scraper in scraper.py
```
def scraper(url: str, resp: utils.response.Response) -> list:
pass
```
The scraper takes in two parameters:
**ARGS**
*url*:
The URL that was added to the frontier, and downloaded from the cache.
    It is of type str.
*resp*:
This is the response given by the caching server for the requested URL.
The response is an object of type Response (see utils/response.py)
```
class Response:
Attributes:
url:
The URL identifying the response.
status:
An integer that identifies the status of the response. This
follows the same status codes of http.
(REF: https://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html)
In addition there are status codes provided by the caching
server (600-606) that define caching specific errors.
error:
If the status codes are between 600 and 606, the reason for
the error is provided in this attribute. Note that for status codes
(400-599), the error message is not put in this error attribute; instead it
            must be picked up from the raw_response (if any, and if useful).
raw_response:
If the status is between 200-599 (standard http), the raw
response object is the one defined by the requests library.
Useful resources in understanding this raw response object:
https://realpython.com/python-requests/#the-response
https://requests.kennethreitz.org/en/master/api/#requests.Response
HINT: raw_response.content gives you the webpage html content.
```
**Return Value**
This function needs to return a list of urls that are scraped from the
response (an empty list for responses that are empty). These urls will be
added to the Frontier and retrieved from the cache. These urls have to be
filtered so that urls that do not have to be downloaded are not added to the
frontier.
The first step of filtering the urls can be done with the **is_valid** function
provided in the same scraper.py file. Additional rules should be added to is_valid to filter out urls you do not want to crawl.
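As a starting point, a minimal scraper could look like the following sketch. It assumes
BeautifulSoup is installed for HTML parsing and relies on the is_valid function already
present in scraper.py; it is an illustration, not the required implementation.
```
from urllib.parse import urljoin
from bs4 import BeautifulSoup  # assumption: beautifulsoup4 is installed

def scraper(url, resp):
    # Only parse pages that the cache server returned successfully.
    if resp.status != 200 or resp.raw_response is None:
        return []
    soup = BeautifulSoup(resp.raw_response.content, "html.parser")
    links = []
    for anchor in soup.find_all("a", href=True):
        # Resolve relative links against the page url and drop fragments.
        absolute = urljoin(resp.url, anchor["href"]).split("#")[0]
        if is_valid(absolute):
            links.append(absolute)
    return links
```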
EXECUTION
-------------------------
To execute the crawler, run launch.py:
```python3 launch.py```
You can restart the crawler from the seed url
(all current progress will be deleted) using the command
```python3 launch.py --restart```
You can specify a different config file with the --config_file option:
```python3 launch.py --config_file path/to/config```
ARCHITECTURE
-------------------------
### FLOW
The crawler receives a cache host and port from the spacetime servers
and instantiates the config.
It launches a crawler (defined in crawler/\_\_init\_\_.py L5) which creates a
Frontier and Worker(s) using the optional parameters frontier_factory, and
worker_factory.
When the crawler is started, workers are created that pick up an
undownloaded link from the frontier, download it from our cache server, and
pass the response to your scraper function. The links returned by
the scraper are added to the list of undownloaded links in the frontier, and
the url that was downloaded is marked as complete. The cycle continues until
there are no more urls to be downloaded in the frontier.
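For example, custom implementations can be wired in as sketched below. Here config and
restart are assumed to be built the same way launch.py builds them, and MyFrontier and
MyWorker are hypothetical names for your own classes implementing the interfaces below.
```
from crawler import Crawler

crawler = Crawler(config, restart,
                  frontier_factory=MyFrontier,
                  worker_factory=MyWorker)
crawler.start()
```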
### REDEFINING THE FRONTIER
You can make your own frontier to use with the crawler if it meets this
interface definition:
```
class Frontier:
def __init__(self, config, restart):
#Initializer.
# config -> Config object (defined in utils/config.py L1)
# Note that the cache server is already defined at this
# point.
# restart -> A bool that is True if the crawler has to restart
# from the seed url and delete any current progress.
def get_tbd_url(self):
# Get one url that has to be downloaded.
# Can return None to signify the end of crawling.
def add_url(self, url):
# Adds one url to the frontier to be downloaded later.
# Checks can be made to prevent downloading duplicates.
def mark_url_complete(self, url):
# mark a url as completed so that on restart, this url is not
# downloaded again.
```
A sample reference is given in utils/frontier.py L10. Note that this
reference is not thread safe.
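If you do enable multiple workers, the frontier must be made thread safe first. One
possible approach is sketched below; it assumes the reference Frontier is importable as
crawler.frontier (where this commit places it) and simply guards every operation with a
lock. The lock is created before calling the parent initializer because the parent may
already call add_url for the seed urls.
```
from threading import RLock
from crawler.frontier import Frontier  # reference implementation in this commit

class LockingFrontier(Frontier):
    # Sketch: serialize access to the shared url list and the save file.
    def __init__(self, config, restart):
        self.lock = RLock()
        super().__init__(config, restart)

    def get_tbd_url(self):
        with self.lock:
            return super().get_tbd_url()

    def add_url(self, url):
        with self.lock:
            super().add_url(url)

    def mark_url_complete(self, url):
        with self.lock:
            super().mark_url_complete(url)
```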
### REDEFINING THE WORKER
You can make your own worker to use with the crawler if it meets this
interface definition:
```
import time
from scraper import scraper
from utils.download import download
class Worker(Thread): # Worker must inherit from Thread or Process.
def __init__(self, worker_id, config, frontier):
# worker_id -> a unique id for the worker to self identify.
# config -> Config object (defined in utils/config.py L1)
# Note that the cache server is already defined at this
# point.
# frontier -> Frontier object created by the Crawler. Base reference
# is shown in utils/frontier.py L10 but can be overloaded
# as detailed above.
        self.config = config
        self.frontier = frontier
super().__init__(daemon=True)
    def run(self):
        while True:
            # Get one undownloaded link from the frontier (None ends the loop).
            url = self.frontier.get_tbd_url()
            if url is None:
                break
            resp = download(url, self.config)
            next_links = scraper(url, resp)
            for next_link in next_links:
                self.frontier.add_url(next_link)
            self.frontier.mark_url_complete(url)
            # Politeness: wait before the next download.
            time.sleep(self.config.time_delay)
```
A sample reference is given in utils/worker.py L9.
THINGS TO KEEP IN MIND
-------------------------
1. It is important to filter out urls that do not point to a webpage. For
example, PDFs, PPTs, css, js, etc. The is_valid function filters out a large
number of such extensions, but there may be more.
2. It is important to filter out urls that are not within the ics.uci.edu domain.
3. It is important to maintain politeness to the cache server (on a per-domain
basis); one possible approach is sketched after this list.
4. It is important to set the user agent in the config.ini correctly to get
credit for hitting the cache servers.
5. Launching multiple instances of the crawler will cause the same urls to be
downloaded by each instance. Mechanisms can be used to avoid that; however,
the politeness limits still apply and will be checked.
6. Do not attempt to download the links directly from ics servers.
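One possible way to keep track of per-domain politeness is sketched below. This is an
illustration, not the provided implementation (the reference worker only sleeps for a
fixed delay after every download): it remembers when each host was last contacted and
waits out the remaining delay.
```
import time
from threading import Lock
from urllib.parse import urlparse

class DomainPoliteness:
    # Sketch: space out consecutive requests to the same host.
    def __init__(self, delay):
        self.delay = delay
        self.last_hit = {}   # host -> time the last request for it was (or will be) sent
        self.lock = Lock()   # needed if several workers share one instance

    def wait(self, url):
        host = urlparse(url).netloc
        with self.lock:
            now = time.time()
            ready_at = self.last_hit.get(host, 0.0) + self.delay
            # Reserve this host's next slot before releasing the lock.
            self.last_hit[host] = max(now, ready_at)
        if ready_at > now:
            time.sleep(ready_at - now)
```
A worker could call wait(url) right before download(url, self.config) so that requests
to the same host are separated by at least the configured POLITENESS delay.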

View File

@@ -0,0 +1,20 @@
[IDENTIFICATION]
# Set your user agent string here.
USERAGENT = IR US22 19854690
[CONNECTION]
HOST = styx.ics.uci.edu
PORT = 9000
[CRAWLER]
SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu
# In seconds
POLITENESS = 0.5
[LOCAL PROPERTIES]
# Save file for progress
SAVE = frontier.shelve
# IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING.
THREADCOUNT = 1

View File

@@ -0,0 +1,26 @@
from utils import get_logger
from crawler.frontier import Frontier
from crawler.worker import Worker
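# The Crawler ties a Frontier to a set of Workers: start_async() spawns one
# worker thread per configured THREADCOUNT, join() waits for them to finish,
# and start() simply does both.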
class Crawler(object):
def __init__(self, config, restart, frontier_factory=Frontier, worker_factory=Worker):
self.config = config
self.logger = get_logger("CRAWLER")
self.frontier = frontier_factory(config, restart)
self.workers = list()
self.worker_factory = worker_factory
def start_async(self):
self.workers = [
self.worker_factory(worker_id, self.config, self.frontier)
for worker_id in range(self.config.threads_count)]
for worker in self.workers:
worker.start()
def start(self):
self.start_async()
self.join()
def join(self):
for worker in self.workers:
worker.join()

View File

@@ -0,0 +1,72 @@
import os
import shelve
from threading import Thread, RLock
from queue import Queue, Empty
from utils import get_logger, get_urlhash, normalize
from scraper import is_valid
class Frontier(object):
def __init__(self, config, restart):
self.logger = get_logger("FRONTIER")
self.config = config
self.to_be_downloaded = list()
if not os.path.exists(self.config.save_file) and not restart:
# Save file does not exist, but request to load save.
self.logger.info(
f"Did not find save file {self.config.save_file}, "
f"starting from seed.")
elif os.path.exists(self.config.save_file) and restart:
            # Save file does exist, but request to start from seed.
self.logger.info(
f"Found save file {self.config.save_file}, deleting it.")
os.remove(self.config.save_file)
# Load existing save file, or create one if it does not exist.
self.save = shelve.open(self.config.save_file)
if restart:
for url in self.config.seed_urls:
self.add_url(url)
else:
# Set the frontier state with contents of save file.
self._parse_save_file()
if not self.save:
for url in self.config.seed_urls:
self.add_url(url)
def _parse_save_file(self):
''' This function can be overridden for alternate saving techniques. '''
total_count = len(self.save)
tbd_count = 0
for url, completed in self.save.values():
if not completed and is_valid(url):
self.to_be_downloaded.append(url)
tbd_count += 1
self.logger.info(
f"Found {tbd_count} urls to be downloaded from {total_count} "
f"total urls discovered.")
def get_tbd_url(self):
try:
return self.to_be_downloaded.pop()
except IndexError:
return None
def add_url(self, url):
url = normalize(url)
urlhash = get_urlhash(url)
if urlhash not in self.save:
self.save[urlhash] = (url, False)
self.save.sync()
self.to_be_downloaded.append(url)
def mark_url_complete(self, url):
urlhash = get_urlhash(url)
if urlhash not in self.save:
# This should not happen.
self.logger.error(
f"Completed url {url}, but have not seen it before.")
self.save[urlhash] = (url, True)
self.save.sync()

View File

@@ -0,0 +1,33 @@
from threading import Thread
from inspect import getsource
from utils.download import download
from utils import get_logger
import scraper
import time
class Worker(Thread):
def __init__(self, worker_id, config, frontier):
self.logger = get_logger(f"Worker-{worker_id}", "Worker")
self.config = config
self.frontier = frontier
# basic check for requests in scraper
assert {getsource(scraper).find(req) for req in {"from requests import", "import requests"}} == {-1}, "Do not use requests from scraper.py"
super().__init__(daemon=True)
def run(self):
while True:
tbd_url = self.frontier.get_tbd_url()
if not tbd_url:
self.logger.info("Frontier is empty. Stopping Crawler.")
break
resp = download(tbd_url, self.config, self.logger)
self.logger.info(
f"Downloaded {tbd_url}, status <{resp.status}>, "
f"using cache {self.config.cache_server}.")
scraped_urls = scraper.scraper(tbd_url, resp)
for scraped_url in scraped_urls:
self.frontier.add_url(scraped_url)
self.frontier.mark_url_complete(tbd_url)
time.sleep(self.config.time_delay)

View File

@@ -0,0 +1,23 @@
from configparser import ConfigParser
from argparse import ArgumentParser
from utils.server_registration import get_cache_server
from utils.config import Config
from crawler import Crawler
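# Entry point: read config.ini, register with the cache server to obtain its
# (host, port), then build and start the crawler.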
def main(config_file, restart):
cparser = ConfigParser()
cparser.read(config_file)
config = Config(cparser)
config.cache_server = get_cache_server(config, restart)
crawler = Crawler(config, restart)
crawler.start()
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--restart", action="store_true", default=False)
parser.add_argument("--config_file", type=str, default="config.ini")
args = parser.parse_args()
main(args.config_file, args.restart)

View File

@@ -0,0 +1,2 @@
cbor
requests

View File

@@ -0,0 +1,45 @@
import re
from urllib.parse import urlparse
def is_valid(url):
# Decide whether to crawl this url or not.
# If you decide to crawl it, return True; otherwise return False.
# There are already some conditions that return False.
try:
        # Check that the url belongs to one of the allowed domains.
parsed = urlparse(url)
if parsed.scheme not in set(["http", "https"]):
return False
elif re.match(
r".*\.(css|js|bmp|gif|jpe?g|ico"
+ r"|png|tiff?|mid|mp2|mp3|mp4"
+ r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
+ r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names"
+ r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
+ r"|epub|dll|cnf|tgz|sha1"
+ r"|thmx|mso|arff|rtf|jar|csv"
+ r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
return False
elif not re.match(
r".*ics.uci.edu/.*"
+ r"|.*cs.uci.edu/.*"
+ r"|.*informatics.uci.edu/.*"
+ r"|.*stat.uci.edu/.*"
+ r"|today.uci.edu/department/information_computer_sciences/.*",url):
return False
elif parsed.fragment:
return False
else:
return True
except TypeError:
print ("TypeError for ", parsed)
raise
# Quick manual check: classify each url listed in temp.txt.
with open("temp.txt") as file:
    for line in file:
        print(is_valid(line.strip()))

View File

@@ -0,0 +1,103 @@
import re
from urllib.parse import urlparse
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def scraper(url, resp):
links = extract_next_links(url, resp)
links_valid = list()
valid_links = open("valid_links.txt",'a')
invalid_links = open("invalid_links.txt",'a')
for link in links:
if is_valid(link):
links_valid.append(link)
valid_links.write(link + "\n")
else:
invalid_links.write("From: " + url + "\n")
invalid_links.write(link + "\n")
return links_valid
def extract_next_links(url, resp):
# Implementation required.
# url: the URL that was used to get the page
# resp.url: the actual url of the page
# resp.status: the status code returned by the server. 200 is OK, you got the page. Other numbers mean that there was some kind of problem.
# resp.error: when status is not 200, you can check the error here, if needed.
# resp.raw_response: this is where the page actually is. More specifically, the raw_response has two parts:
# resp.raw_response.url: the url, again
# resp.raw_response.content: the content of the page!
# Return a list with the hyperlinks (as strings) scrapped from resp.raw_response.content
pages = list()
if resp.status == 200:
#do stuff
        soup = BeautifulSoup(resp.raw_response.content, "html.parser")
        tempFile = open("test6.txt", 'a')
        # Grab every anchor tag; href=True means the tag has an href attribute.
        for link in soup.find_all('a', href=True):
            href_link = link.get('href')
            # Some <a href> values are scheme-relative ("//host/path") or
            # root-relative ("/path"); resolve both against the page url.
            if href_link.startswith("//") or href_link.startswith("/"):
                href_link = urljoin(url, href_link)
            # Skip query strings with actions that mutate the site and act as traps.
            if "do=" in href_link:
                continue
            tempFile.write(href_link + "\n")
            pages.append(href_link)
else:
print("Page error !")
return pages
#*.ics.uci.edu/*
#*.cs.uci.edu/*
#*.informatics.uci.edu/*
#*.stat.uci.edu/*
#today.uci.edu/department/information_computer_sciences/*
def is_valid(url):
# Decide whether to crawl this url or not.
# If you decide to crawl it, return True; otherwise return False.
# There are already some conditions that return False.
try:
        # Check that the url belongs to one of the allowed domains.
parsed = urlparse(url)
if parsed.scheme not in set(["http", "https"]):
return False
elif re.match(
r".*\.(css|js|bmp|gif|jpe?g|ico"
+ r"|png|tiff?|mid|mp2|mp3|mp4"
+ r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
+ r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names"
+ r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
+ r"|epub|dll|cnf|tgz|sha1"
+ r"|thmx|mso|arff|rtf|jar|csv"
+ r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
return False
elif not re.match(
r".*ics.uci.edu/.*"
+ r"|.*cs.uci.edu/.*"
+ r"|.*informatics.uci.edu/.*"
+ r"|.*stat.uci.edu/.*"
+ r"|today.uci.edu/department/information_computer_sciences/.*",url):
return False
elif parsed.fragment:
return False
else:
return True
except TypeError:
print ("TypeError for ", parsed)
raise

View File

@@ -0,0 +1,35 @@
import os
import logging
from hashlib import sha256
from urllib.parse import urlparse
def get_logger(name, filename=None):
logger = logging.getLogger(name)
logger.setLevel(logging.INFO)
if not os.path.exists("Logs"):
os.makedirs("Logs")
fh = logging.FileHandler(f"Logs/{filename if filename else name}.log")
fh.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)
return logger
def get_urlhash(url):
parsed = urlparse(url)
# everything other than scheme.
return sha256(
f"{parsed.netloc}/{parsed.path}/{parsed.params}/"
f"{parsed.query}/{parsed.fragment}".encode("utf-8")).hexdigest()
def normalize(url):
if url.endswith("/"):
return url.rstrip("/")
return url

View File

@@ -0,0 +1,19 @@
import re
class Config(object):
def __init__(self, config):
self.user_agent = config["IDENTIFICATION"]["USERAGENT"].strip()
print (self.user_agent)
assert self.user_agent != "DEFAULT AGENT", "Set useragent in config.ini"
assert re.match(r"^[a-zA-Z0-9_ ,]+$", self.user_agent), "User agent should not have any special characters outside '_', ',' and 'space'"
self.threads_count = int(config["LOCAL PROPERTIES"]["THREADCOUNT"])
self.save_file = config["LOCAL PROPERTIES"]["SAVE"]
self.host = config["CONNECTION"]["HOST"]
self.port = int(config["CONNECTION"]["PORT"])
self.seed_urls = config["CRAWLER"]["SEEDURL"].split(",")
self.time_delay = float(config["CRAWLER"]["POLITENESS"])
self.cache_server = None

View File

@@ -0,0 +1,36 @@
import requests
import cbor
import time
from utils.response import Response
def download(url, config, logger=None):
host, port = config.cache_server
    # Check the headers first so we never download anything above 1MB.
    # ~2MB is the average size of a webpage with all images and scripts;
    # roughly 1MB is a reasonable upper bound for the html markup alone.
resp = requests.head(
f"http://{host}:{port}/",
params=[("q", f"{url}"), ("u", f"{config.user_agent}")])
    # requests.head may omit the content-length header, so read it with .get().
    content_length = resp.headers.get('content-length')
    if content_length and content_length.isdigit():
        if int(content_length) > 1000000:
            print(int(content_length))
            return Response({
                "error": "FILE TOO LARGE !",
                "status": 606,
                "url": url})
resp = requests.get(
f"http://{host}:{port}/",
params=[("q", f"{url}"), ("u", f"{config.user_agent}")])
try:
if resp and resp.content:
return Response(cbor.loads(resp.content))
    except (EOFError, ValueError):
        pass
    # logger defaults to None, so guard before logging the failure.
    if logger:
        logger.error(f"Spacetime Response error {resp} with url {url}.")
return Response({
"error": f"Spacetime Response error {resp} with url {url}.",
"status": resp.status_code,
"url": url})

View File

@@ -0,0 +1,15 @@
from rtypes import pcc_set, dimension, primarykey
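# Register is the spacetime type used to hand-shake with the cache server's
# load balancer: the crawler publishes one keyed by its user agent and reads
# back the assigned (host, port) in load_balancer (see utils/server_registration.py).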
@pcc_set
class Register(object):
crawler_id = primarykey(str)
load_balancer = dimension(tuple)
fresh = dimension(bool)
invalid = dimension(bool)
def __init__(self, crawler_id, fresh):
self.crawler_id = crawler_id
self.load_balancer = tuple()
self.fresh = fresh
self.invalid = False

View File

@@ -0,0 +1,14 @@
import pickle
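# Wraps the cache server's reply. raw_response, when present, is the pickled
# requests.Response for the page; it is absent for cache-specific errors (6xx).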
class Response(object):
def __init__(self, resp_dict):
self.url = resp_dict["url"]
self.status = resp_dict["status"]
self.error = resp_dict["error"] if "error" in resp_dict else None
try:
self.raw_response = (
pickle.loads(resp_dict["response"])
if "response" in resp_dict else
None)
except TypeError:
self.raw_response = None

View File

@@ -0,0 +1,26 @@
import os
from spacetime import Node
from utils.pcc_models import Register
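# init runs inside the spacetime Node: it registers this crawler's user agent,
# waits until the load balancer assigns a cache server, raises if the user
# agent is rejected, and finally returns the assigned (host, port).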
def init(df, user_agent, fresh):
reg = df.read_one(Register, user_agent)
if not reg:
reg = Register(user_agent, fresh)
df.add_one(Register, reg)
df.commit()
df.push_await()
while not reg.load_balancer:
df.pull_await()
if reg.invalid:
raise RuntimeError("User agent string is not acceptable.")
if reg.load_balancer:
df.delete_one(Register, reg)
df.commit()
df.push()
return reg.load_balancer
def get_cache_server(config, restart):
init_node = Node(
init, Types=[Register], dataframe=(config.host, config.port))
return init_node.start(
config.user_agent, restart or not os.path.exists(config.save_file))