client: cleanup

This commit is contained in:
2021-03-21 13:10:57 +01:00
parent d8a5765fd8
commit 638576f471
5 changed files with 77 additions and 128 deletions

View File

@@ -1,6 +1,7 @@
import logging
import os
import re
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse
@@ -10,21 +11,7 @@ from client.Retriever import Retriever
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
def parse_uri(uri: str):
    """Split *uri* into a ``(host, path)`` pair.

    The URI is parsed with ``urllib.parse.urlparse``. If that yields no
    network location (for example ``example.com/index.html``, which has
    neither a scheme nor a leading ``//``), ``//`` is prepended and the URI
    is parsed a second time so the first component is read as the host.

    Args:
        uri: The URI to split.

    Returns:
        A ``(host, path)`` tuple. ``host`` is the parsed ``netloc`` and may
        be empty if none could be determined. ``path`` always starts with
        ``/``; the query string and fragment are not part of it.
    """
    parsed = urlparse(uri)

    # If there is no netloc, the url is invalid, so prepend `//` and try again
    if not parsed.netloc:
        parsed = urlparse("//" + uri)

    path = parsed.path
    # Normalise to an absolute path; startswith() also covers the empty path.
    if not path.startswith("/"):
        path = "/" + path

    return parsed.netloc, path
class ResponseHandler:
class ResponseHandler(ABC):
client: HTTPClient
headers: Dict[str, str]
status_code: int
@@ -38,6 +25,7 @@ class ResponseHandler:
self.retriever = retriever
pass
@abstractmethod
def handle(self):
    """Process the response; this abstract stub itself does nothing."""
@@ -133,8 +121,22 @@ class ResponseHandler:
logging.error("Invalid content-length value: %r", next_value)
raise InvalidResponse()
@staticmethod
def parse_uri(uri: str):
parsed = urlparse(uri)
class DownloadHandler(ResponseHandler):
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + uri)
host = parsed.netloc
path = parsed.path
if len(path) == 0 or path[0] != '/':
path = "/" + path
return host, path
class DownloadHandler(ResponseHandler, ABC):
path: str
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
@@ -152,9 +154,6 @@ class DownloadHandler(ResponseHandler):
return HTMLDownloadHandler(retriever, client, headers, url, dir)
return RawDownloadHandler(retriever, client, headers, url, dir)
def handle(self) -> str:
    """Placeholder handler with no behaviour of its own.

    NOTE(review): despite the ``-> str`` annotation this stub returns
    ``None``; presumably the concrete download handlers override it.
    """
def _create_directory(self):
path = self._get_duplicate_name(os.path.abspath(self.client.host))
os.mkdir(path)
@@ -248,7 +247,7 @@ class HTMLDownloadHandler(DownloadHandler):
def __download_images(self, tmp_filename, target_filename):
(host, path) = parse_uri(self.url)
(host, path) = ResponseHandler.parse_uri(self.url)
with open(tmp_filename, "rb") as fp:
soup = BeautifulSoup(fp, 'html.parser')
@@ -286,7 +285,7 @@ class HTMLDownloadHandler(DownloadHandler):
img_path = parsed.path
else:
same_host = False
(img_host, img_path) = parse_uri(img_src)
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"