# webcrawler/spacetime-crawler4py-master/crawler/worker.py

from threading import Thread

from inspect import getsource
from utils.download import download
from utils import get_logger
import scraper
import time


class Worker(Thread):
    """Daemon thread that repeatedly takes a URL from the frontier and crawls it.

    For each URL the worker downloads the page via ``download``, passes the
    response to ``scraper.scraper``, adds every returned URL to the frontier
    and marks the original URL as complete. Timing of each phase is printed
    to stdout.
    """

    def __init__(self, worker_id, config, frontier):
        """Create the worker.

        Args:
            worker_id: Identifier used to name this worker's logger.
            config: Configuration object; this class reads ``time_delay`` and
                ``cache_server`` from it and forwards it to ``download``.
            frontier: Object providing ``get_tbd_url``, ``add_url``,
                ``mark_url_complete``, ``acquire_polite`` and
                ``release_polite``.

        Raises:
            AssertionError: If the source of the ``scraper`` module contains
                an import of ``requests``.
        """
        self.logger = get_logger(f"Worker-{worker_id}", "Worker")
        self.config = config
        self.frontier = frontier
        # basic check for requests in scraper
        scraper_source = getsource(scraper)
        banned_imports = ("from requests import", "import requests")
        assert not any(banned in scraper_source for banned in banned_imports), (
            "Do not use requests from scraper.py")
        super().__init__(daemon=True)

    def run(self):
        """Crawl URLs until the frontier returns a falsy URL.

        Each iteration acquires the frontier's politeness guard for the URL,
        processes it, waits until ``config.time_delay`` seconds have passed
        since the iteration started, and then releases the guard exactly
        once -- also when processing raised, so the guard is never left held.
        """
        while True:
            start = time.perf_counter()
            tbd_url = self.frontier.get_tbd_url()
            toc = time.perf_counter()
            print(f"Took {toc - start:0.4f} seconds to get_tbd_url")
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break

            # NOTE(review): acquire_polite/release_polite are assumed to be a
            # balanced per-URL guard (one release per acquire) -- confirm
            # against the frontier implementation.
            self.frontier.acquire_polite(tbd_url)
            try:
                self._process_url(tbd_url)
            finally:
                self._wait_out_delay(start)
                self.frontier.release_polite(tbd_url)

    def _process_url(self, tbd_url):
        """Download, scrape and store the results for a single URL."""
        tic = time.perf_counter()
        resp = download(tbd_url, self.config, self.logger)
        toc = time.perf_counter()
        print(f"Took {toc - tic:0.4f} seconds to do download url")

        self.logger.info(
            f"Downloaded {tbd_url}, status <{resp.status}>, "
            f"using cache {self.config.cache_server}.")

        tic = time.perf_counter()
        scraped_urls = scraper.scraper(tbd_url, resp)
        toc = time.perf_counter()
        print(f"Took {toc - tic:0.4f} seconds to do scrape url")

        tic = time.perf_counter()
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        toc = time.perf_counter()
        print(f"Took {toc - tic:0.4f} seconds to do store stuffs")

    def _wait_out_delay(self, start):
        """Sleep until ``config.time_delay`` seconds have passed since ``start``."""
        remaining = start + self.config.time_delay - time.perf_counter()
        if remaining > 0:
            time.sleep(remaining)