webcrawler/spacetime-crawler4py-master/utils/download.py

import requests
import cbor
import time
from utils.response import Response


def download(url, config, logger=None):
    host, port = config.cache_server
    # Check the headers first so we never download anything over 1 MB:
    # ~2 MB is the average size of a webpage with all images and scripts,
    # and ~1 MB is roughly the upper limit for the HTML markup alone,
    # i.e. about double the average size of a page without those assets.
    resp = requests.head(
        f"http://{host}:{port}/",
        params=[("q", f"{url}"), ("u", f"{config.user_agent}")])
    content_length = resp.headers.get("content-length")
    if content_length and content_length.isdigit():
        if int(content_length) > 1000000:
            print(int(content_length))
            return Response({
                "error": "FILE TOO LARGE!",
                "status": 606,
                "url": url})
    resp = requests.get(
        f"http://{host}:{port}/",
        params=[("q", f"{url}"), ("u", f"{config.user_agent}")])
    try:
        if resp and resp.content:
            return Response(cbor.loads(resp.content))
    except (EOFError, ValueError):
        pass
    if logger:
        logger.error(f"Spacetime Response error {resp} with url {url}.")
    return Response({
        "error": f"Spacetime Response error {resp} with url {url}.",
        "status": resp.status_code,
        "url": url})