webcrawler/spacetime-crawler4py-master/utils/download.py

import requests
import cbor
import time
from utils.response import Response


def download(url, config, logger=None):
    host, port = config.cache_server
    # Check the headers first so we never download anything over 1 MB:
    # ~2 MB is the average size of a webpage with all images and scripts,
    # and ~1 MB is roughly the upper limit for the HTML markup alone,
    # i.e. about double the average size of a page without those assets.
    resp = requests.head(
        f"http://{host}:{port}/",
        params=[("q", f"{url}"), ("u", f"{config.user_agent}")])
    content_length = resp.headers.get("content-length")
    if content_length and content_length.isdigit():
        if int(content_length) > 1000000:
            print(int(content_length))
            return Response({
                "error": "FILE TOO LARGE!",
                "status": 606,
                "url": url})
    resp = requests.get(
        f"http://{host}:{port}/",
        params=[("q", f"{url}"), ("u", f"{config.user_agent}")])
    try:
        if resp and resp.content:
            return Response(cbor.loads(resp.content))
    except (EOFError, ValueError):
        pass
    if logger:
        logger.error(f"Spacetime Response error {resp} with url {url}.")
    return Response({
        "error": f"Spacetime Response error {resp} with url {url}.",
        "status": resp.status_code,
        "url": url})