client: cleanup
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@@ -10,21 +11,7 @@ from client.Retriever import Retriever
|
||||
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
|
||||
|
||||
|
||||
def parse_uri(uri: str):
    """Split a URI into its host and an absolute path.

    The URI is parsed with ``urllib.parse.urlparse``.  When that first parse
    yields no network location (for example ``example.com/index.html``, which
    carries no ``//`` marker), the URI is parsed a second time with ``//``
    prepended so that its leading component is read as the host.

    Leading and trailing whitespace is ignored.  Only the path component is
    returned: any query string or fragment in ``uri`` is discarded.

    :param uri: the URI to split, with or without a scheme.
    :return: a ``(host, path)`` tuple.  ``host`` is the parsed netloc (empty
        when none could be determined) and ``path`` always starts with ``/``.
    """
    # Stray whitespace around the value would otherwise end up in the host
    # when the `//` retry below kicks in.
    uri = uri.strip()
    parsed = urlparse(uri)

    # urlparse() only recognises a netloc after `//`; a scheme-less URI has
    # none, so prepend the marker and parse again.
    if not parsed.netloc:
        parsed = urlparse("//" + uri)

    path = parsed.path
    if not path.startswith("/"):
        path = "/" + path
    return parsed.netloc, path
|
||||
|
||||
|
||||
class ResponseHandler:
|
||||
class ResponseHandler(ABC):
|
||||
client: HTTPClient
|
||||
headers: Dict[str, str]
|
||||
status_code: int
|
||||
@@ -38,6 +25,7 @@ class ResponseHandler:
|
||||
self.retriever = retriever
|
||||
pass
|
||||
|
||||
@abstractmethod
def handle(self):
    """Abstract hook; this declaration provides no behaviour of its own.

    It is marked ``@abstractmethod``, so concrete subclasses are expected to
    override it.  The body here does nothing and returns ``None``.
    """
    pass
|
||||
|
||||
@@ -133,8 +121,22 @@ class ResponseHandler:
|
||||
logging.error("Invalid content-length value: %r", next_value)
|
||||
raise InvalidResponse()
|
||||
|
||||
@staticmethod
def parse_uri(uri: str):
    """Split a URI into its host and an absolute path.

    The URI is parsed with ``urllib.parse.urlparse``.  When that first parse
    yields no network location (for example ``example.com/index.html``, which
    carries no ``//`` marker), the URI is parsed a second time with ``//``
    prepended so that its leading component is read as the host.

    Leading and trailing whitespace is ignored.  Only the path component is
    returned: any query string or fragment in ``uri`` is discarded.

    :param uri: the URI to split, with or without a scheme.
    :return: a ``(host, path)`` tuple.  ``host`` is the parsed netloc (empty
        when none could be determined) and ``path`` always starts with ``/``.
    """
    # Stray whitespace around the value would otherwise end up in the host
    # when the `//` retry below kicks in.
    uri = uri.strip()
    parsed = urlparse(uri)

    # urlparse() only recognises a netloc after `//`; a scheme-less URI has
    # none, so prepend the marker and parse again.
    if not parsed.netloc:
        parsed = urlparse("//" + uri)

    path = parsed.path
    if not path.startswith("/"):
        path = "/" + path
    return parsed.netloc, path
|
||||
|
||||
|
||||
class DownloadHandler(ResponseHandler, ABC):
|
||||
path: str
|
||||
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
|
||||
@@ -152,9 +154,6 @@ class DownloadHandler(ResponseHandler):
|
||||
return HTMLDownloadHandler(retriever, client, headers, url, dir)
|
||||
return RawDownloadHandler(retriever, client, headers, url, dir)
|
||||
|
||||
def handle(self) -> str:
    """Placeholder implementation with an empty body.

    NOTE(review): the signature is annotated as returning ``str``, but the
    body is only ``pass``, so a direct call returns ``None``.
    """
    pass
|
||||
|
||||
def _create_directory(self):
|
||||
path = self._get_duplicate_name(os.path.abspath(self.client.host))
|
||||
os.mkdir(path)
|
||||
@@ -248,7 +247,7 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
|
||||
def __download_images(self, tmp_filename, target_filename):
|
||||
|
||||
(host, path) = parse_uri(self.url)
|
||||
(host, path) = ResponseHandler.parse_uri(self.url)
|
||||
with open(tmp_filename, "rb") as fp:
|
||||
soup = BeautifulSoup(fp, 'html.parser')
|
||||
|
||||
@@ -286,7 +285,7 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
img_path = parsed.path
|
||||
else:
|
||||
same_host = False
|
||||
(img_host, img_path) = parse_uri(img_src)
|
||||
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
|
||||
|
||||
message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
|
||||
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
||||
|
Reference in New Issue
Block a user