Add files via upload

First Upload
iNocturnis
2022-04-15 17:55:11 -07:00
committed by GitHub
commit e19f68a6a6
16 changed files with 689 additions and 0 deletions

crawler/__init__.py

@@ -0,0 +1,26 @@
from utils import get_logger
from crawler.frontier import Frontier
from crawler.worker import Worker


class Crawler(object):
    def __init__(self, config, restart,
                 frontier_factory=Frontier, worker_factory=Worker):
        self.config = config
        self.logger = get_logger("CRAWLER")
        self.frontier = frontier_factory(config, restart)
        self.workers = list()
        self.worker_factory = worker_factory

    def start_async(self):
        self.workers = [
            self.worker_factory(worker_id, self.config, self.frontier)
            for worker_id in range(self.config.threads_count)]
        for worker in self.workers:
            worker.start()

    def start(self):
        self.start_async()
        self.join()

    def join(self):
        for worker in self.workers:
            worker.join()
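For context, a minimal launch sketch. The config attribute names (threads_count, seed_urls, save_file, cache_server, time_delay) are exactly the ones read by the classes in this commit, but the Config loader itself is not included here, so a SimpleNamespace stands in for it; this assumes the repo's utils and scraper modules are importable.

from types import SimpleNamespace
from crawler import Crawler

config = SimpleNamespace(
    threads_count=2,                     # number of Worker threads to spawn
    seed_urls=["https://example.com"],   # frontier starting points
    save_file="frontier.shelve",         # shelve path used by Frontier
    cache_server=None,                   # only read for Worker's log line
    time_delay=0.5)                      # politeness delay per worker

crawler = Crawler(config, restart=True)  # restart=True wipes any old save
crawler.start()                          # start_async() then join()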

crawler/frontier.py

@@ -0,0 +1,72 @@
import os
import shelve

from threading import Thread, RLock
from queue import Queue, Empty

from utils import get_logger, get_urlhash, normalize
from scraper import is_valid


class Frontier(object):
    def __init__(self, config, restart):
        self.logger = get_logger("FRONTIER")
        self.config = config
        self.to_be_downloaded = list()

        if not os.path.exists(self.config.save_file) and not restart:
            # Save file does not exist, but request was to load a save.
            self.logger.info(
                f"Did not find save file {self.config.save_file}, "
                f"starting from seed.")
        elif os.path.exists(self.config.save_file) and restart:
            # Save file exists, but request was to start from seed.
            self.logger.info(
                f"Found save file {self.config.save_file}, deleting it.")
            os.remove(self.config.save_file)

        # Load existing save file, or create one if it does not exist.
        self.save = shelve.open(self.config.save_file)
        if restart:
            for url in self.config.seed_urls:
                self.add_url(url)
        else:
            # Set the frontier state with contents of save file.
            self._parse_save_file()
            if not self.save:
                for url in self.config.seed_urls:
                    self.add_url(url)

    def _parse_save_file(self):
        ''' This function can be overridden for alternate saving techniques. '''
        total_count = len(self.save)
        tbd_count = 0
        for url, completed in self.save.values():
            if not completed and is_valid(url):
                self.to_be_downloaded.append(url)
                tbd_count += 1
        self.logger.info(
            f"Found {tbd_count} urls to be downloaded from {total_count} "
            f"total urls discovered.")

    def get_tbd_url(self):
        try:
            return self.to_be_downloaded.pop()
        except IndexError:
            return None

    def add_url(self, url):
        url = normalize(url)
        urlhash = get_urlhash(url)
        if urlhash not in self.save:
            self.save[urlhash] = (url, False)
            self.save.sync()
            self.to_be_downloaded.append(url)

    def mark_url_complete(self, url):
        urlhash = get_urlhash(url)
        if urlhash not in self.save:
            # This should not happen.
            self.logger.error(
                f"Completed url {url}, but have not seen it before.")

        self.save[urlhash] = (url, True)
        self.save.sync()
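The shelve file is the frontier's only persistent state: add_url() stores each new URL as a (url, False) tuple keyed by get_urlhash(url), and mark_url_complete() flips the flag to True, which is what lets _parse_save_file() resume an interrupted crawl. A short inspection sketch follows; the file name is whatever config.save_file points at, so "frontier.shelve" is a stand-in.

import shelve

# Each entry maps a url hash to a (url, completed) tuple, the same
# layout _parse_save_file() iterates over on restart.
with shelve.open("frontier.shelve") as save:
    for urlhash, (url, completed) in save.items():
        print(f"{urlhash}  completed={completed}  {url}")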

crawler/worker.py

@@ -0,0 +1,33 @@
from threading import Thread
from inspect import getsource

from utils.download import download
from utils import get_logger
import scraper
import time


class Worker(Thread):
    def __init__(self, worker_id, config, frontier):
        self.logger = get_logger(f"Worker-{worker_id}", "Worker")
        self.config = config
        self.frontier = frontier
        # Basic check that scraper.py does not import requests directly;
        # all downloads must go through the download() wrapper below.
        assert {getsource(scraper).find(req) for req in {"from requests import", "import requests"}} == {-1}, "Do not use requests in scraper.py"
        super().__init__(daemon=True)

    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper.scraper(tbd_url, resp)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
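The worker depends on a scraper.py that is not part of this commit: scraper.scraper(url, resp) must return a list of URL strings, and Frontier imports is_valid from the same module. A sketch of what that contract might look like; resp.status is the only response field this commit actually reads, so resp.raw_response is an assumption about the download() wrapper, and the link-extraction and is_valid policies here are purely illustrative.

from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse

class LinkParser(HTMLParser):
    # Collects href attributes from anchor tags.
    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.hrefs.append(value)

def is_valid(url):
    # Placeholder policy: accept http(s) URLs only. The real is_valid
    # (imported by Frontier above) would apply the crawl's own rules.
    return urlparse(url).scheme in {"http", "https"}

def scraper(url, resp):
    # Contract assumed by Worker.run(): return a list of URL strings;
    # Frontier.add_url() normalizes and dedupes them afterwards.
    # resp.raw_response.content is an assumed field, not shown in this commit.
    if resp.status != 200 or resp.raw_response is None:
        return []
    parser = LinkParser()
    parser.feed(resp.raw_response.content.decode("utf-8", errors="ignore"))
    return [urljoin(url, href) for href in parser.hrefs
            if is_valid(urljoin(url, href))]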