webcrawler/spacetime-crawler4py-master/crawler/frontier.py
iNocturnis e19f68a6a6
Add files via upload
First Upload
2022-04-15 17:55:11 -07:00

73 lines
2.6 KiB
Python

import os
import shelve
from threading import Thread, RLock
from queue import Queue, Empty
from utils import get_logger, get_urlhash, normalize
from scraper import is_valid
class Frontier(object):
    """Crawl frontier persisted in a ``shelve`` database.

    The shelf maps a URL hash to a ``(url, completed)`` tuple.  URLs that
    still have to be fetched are additionally kept in the in-memory list
    ``to_be_downloaded``.

    NOTE(review): no locking is performed anywhere in this class; if several
    workers share one frontier, confirm that callers serialize access.
    """

    def __init__(self, config, restart):
        """Open (or recreate) the save file and populate the frontier.

        Args:
            config: object providing ``save_file`` (base path of the shelf)
                and ``seed_urls`` (iterable of start URLs).
            restart: when true, discard any saved state and start again from
                the seed URLs; otherwise resume from the save file and fall
                back to the seeds only if the save file is empty.
        """
        self.logger = get_logger("FRONTIER")
        self.config = config
        self.to_be_downloaded = []

        save_exists = os.path.exists(self.config.save_file)
        if not save_exists and not restart:
            # Save file does not exist, but request to load save.
            self.logger.info(
                f"Did not find save file {self.config.save_file}, "
                f"starting from seed.")
        elif save_exists and restart:
            # Save file does exist, but request to start from seed.
            self.logger.info(
                f"Found save file {self.config.save_file}, deleting it.")
            os.remove(self.config.save_file)

        # Load existing save file, or create one if it does not exist.
        # The dbm backend behind shelve may append an extension or create
        # several files, in which case the exists/remove pair above never
        # sees the old database.  Opening with flag "n" on restart always
        # yields a new, empty database, so stale entries cannot hide the
        # seed URLs from add_url().
        self.save = shelve.open(
            self.config.save_file, flag="n" if restart else "c")

        if restart:
            self._add_seed_urls()
        else:
            # Set the frontier state with contents of save file.
            self._parse_save_file()
            if not self.save:
                self._add_seed_urls()

    def _add_seed_urls(self):
        """Queue every configured seed URL through add_url()."""
        for url in self.config.seed_urls:
            self.add_url(url)

    def _parse_save_file(self):
        """Queue every saved URL that is not completed and is still valid.

        This function can be overridden for alternate saving techniques.
        """
        total_count = len(self.save)
        tbd_count = 0
        for url, completed in self.save.values():
            if not completed and is_valid(url):
                self.to_be_downloaded.append(url)
                tbd_count += 1
        self.logger.info(
            f"Found {tbd_count} urls to be downloaded from {total_count} "
            f"total urls discovered.")

    def get_tbd_url(self):
        """Remove and return the most recently queued URL.

        Returns:
            The URL popped from the end of ``to_be_downloaded``, or ``None``
            when nothing is left to download.
        """
        try:
            return self.to_be_downloaded.pop()
        except IndexError:
            return None

    def add_url(self, url):
        """Record *url* as discovered and queue it if it was not seen before.

        The URL is normalized first; a URL whose hash is already stored in
        the save file is ignored.
        """
        url = normalize(url)
        urlhash = get_urlhash(url)
        if urlhash not in self.save:
            self.save[urlhash] = (url, False)
            self.save.sync()
            self.to_be_downloaded.append(url)

    def mark_url_complete(self, url):
        """Flag *url* as completed in the save file and sync it to disk.

        An error is logged when the URL was never recorded before; the entry
        is written as completed regardless.
        """
        # Derive the key the same way add_url() does, so a URL passed in a
        # non-normalized form still updates its existing entry instead of
        # creating a second one.
        # Assumes normalize() leaves already-normalized URLs unchanged --
        # TODO confirm against utils.normalize.
        normalized = normalize(url)
        urlhash = get_urlhash(normalized)
        if urlhash not in self.save:
            # This should not happen.
            self.logger.error(
                f"Completed url {url}, but have not seen it before.")
        self.save[urlhash] = (normalized, True)
        self.save.sync()