webcrawler/spacetime-crawler4py-master/crawler/worker.py
iNocturnis e19f68a6a6
Add files via upload
First Upload
2022-04-15 17:55:11 -07:00

34 lines
1.3 KiB
Python

from threading import Thread
from inspect import getsource
from utils.download import download
from utils import get_logger
import scraper
import time
class Worker(Thread):
    """Daemon thread that drains the frontier one URL at a time.

    Each iteration takes a URL from the frontier, downloads it, passes the
    response to ``scraper.scraper`` and feeds every URL it returns back into
    the frontier, then marks the original URL complete and sleeps for
    ``config.time_delay`` before the next fetch.
    """

    # Import forms that indicate scraper.py is using `requests` directly.
    _FORBIDDEN_IMPORTS = ("from requests import", "import requests")

    def __init__(self, worker_id, config, frontier):
        """Create the worker and verify scraper.py does not use ``requests``.

        Args:
            worker_id: Identifier used to name this worker's logger.
            config: Object read for ``cache_server`` and ``time_delay``; it
                is also handed to ``download``.
            frontier: Object providing ``get_tbd_url``, ``add_url`` and
                ``mark_url_complete``.

        Raises:
            AssertionError: If the source of the ``scraper`` module contains
                one of the forbidden ``requests`` import forms.
        """
        super().__init__(daemon=True)
        self.logger = get_logger(f"Worker-{worker_id}", "Worker")
        self.config = config
        self.frontier = frontier
        # basic check for requests in scraper
        # Raised explicitly (rather than via `assert`) so the check is still
        # enforced when Python runs with -O, which strips assert statements.
        if self._uses_requests(getsource(scraper)):
            raise AssertionError("Do not use requests from scraper.py")

    @staticmethod
    def _uses_requests(source):
        """Return True if *source* contains a forbidden ``requests`` import.

        This is a plain substring search, so a match inside a comment or a
        longer module name (e.g. ``import requests_cache``) also counts.
        """
        return any(req in source for req in Worker._FORBIDDEN_IMPORTS)

    def run(self):
        """Crawl until the frontier hands back a falsy URL."""
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper.scraper(tbd_url, resp)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            # Marked complete only after every scraped link has been queued.
            self.frontier.mark_url_complete(tbd_url)
            # Pause before requesting the next URL.
            time.sleep(self.config.time_delay)