This commit is contained in:
2021-03-27 16:30:53 +01:00
parent fdbd865889
commit 3615c56152
14 changed files with 280 additions and 110 deletions

View File

@@ -14,7 +14,7 @@ from httplib.message import ClientMessage as Message
from httplib.retriever import Retriever
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
handler = BasicResponseHandler(client, msg, command)
retriever = handler.handle()
@@ -23,9 +23,9 @@ def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None)
content_type = msg.headers.get("content-type")
if content_type and "text/html" in content_type:
handler = HTMLDownloadHandler(retriever, client, msg, command, dir)
handler = HTMLDownloadHandler(retriever, client, msg, command, directory)
else:
handler = RawDownloadHandler(retriever, client, msg, command, dir)
handler = RawDownloadHandler(retriever, client, msg, command, directory)
return handler.handle()
@@ -130,20 +130,20 @@ class BasicResponseHandler(ResponseHandler):
class DownloadHandler(ResponseHandler, ABC):
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, directory=None):
super().__init__(retriever, client, msg, cmd)
if not dir:
dir = self._create_directory()
if not directory:
directory = self._create_directory()
self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
self.path = self._get_duplicate_name(os.path.join(directory, self.get_filename()))
@staticmethod
def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
def create(retriever: Retriever, client: HTTPClient, msg, cmd, directory=None):
content_type = msg.headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
return RawDownloadHandler(retriever, client, msg, cmd, dir)
return HTMLDownloadHandler(retriever, client, msg, cmd, directory)
return RawDownloadHandler(retriever, client, msg, cmd, directory)
def _create_directory(self):
path = self._get_duplicate_name(os.path.abspath(self.client.host))
@@ -194,14 +194,14 @@ class RawDownloadHandler(DownloadHandler):
class HTMLDownloadHandler(DownloadHandler):
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
super().__init__(retriever, client, msg, cmd, dir)
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, directory=None):
super().__init__(retriever, client, msg, cmd, directory)
def handle(self) -> str:
(dir, file) = os.path.split(self.path)
(directory, file) = os.path.split(self.path)
tmp_filename = f".{file}.tmp"
tmp_path = os.path.join(dir, tmp_filename)
tmp_path = os.path.join(directory, tmp_filename)
file = open(tmp_path, "wb")
for buffer in self.retriever.retrieve():
@@ -217,11 +217,11 @@ class HTMLDownloadHandler(DownloadHandler):
with open(tmp_filename, "rb") as fp:
soup = BeautifulSoup(fp, 'lxml')
base_url = parser.base_url(self.cmd.uri)
base_element = soup.find("base")
base_url = self.cmd.uri
if base_element:
base_url = f"http://{self.cmd.host}" + base_element["href"]
base_url = parser.urljoin(self.cmd.uri, base_element["href"])
processed = {}
tag: Tag
@@ -241,22 +241,18 @@ class HTMLDownloadHandler(DownloadHandler):
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
with open(target_filename, 'w') as file:
file.write(str(soup))
file.write(soup.prettify(formatter="minimal"))
def __download_image(self, img_src, base_url):
"""
Download image from the specified `img_src` and `base_url`.
If the image is available, it will be downloaded to the directory of `self.path`
"""
logging.info("Downloading image: %s", img_src)
parsed = urlsplit(img_src)
if parsed.scheme not in ("", "http", "https"):
# Not a valid url
return None
if parsed.hostname is None:
if img_src[0] == "/":
img_src = f"http://{self.cmd.host}{img_src}"
else:
img_src = parser.absolute_url(base_url, img_src)
img_src = parser.urljoin(base_url, img_src)
if parsed.hostname is None or parsed.hostname == self.cmd.host:
port = self.cmd.port