update

2021-03-27 16:30:53 +01:00
parent fdbd865889
commit 3615c56152
14 changed files with 280 additions and 110 deletions
--- a/client/response_handler.py
+++ b/client/response_handler.py
@@ -14,7 +14,7 @@ from httplib.message import ClientMessage as Message
 from httplib.retriever import Retriever


-def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
+def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
    handler = BasicResponseHandler(client, msg, command)
    retriever = handler.handle()

@@ -23,9 +23,9 @@ def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None)

    content_type = msg.headers.get("content-type")
    if content_type and "text/html" in content_type:
-        handler = HTMLDownloadHandler(retriever, client, msg, command, dir)
+        handler = HTMLDownloadHandler(retriever, client, msg, command, directory)
    else:
-        handler = RawDownloadHandler(retriever, client, msg, command, dir)
+        handler = RawDownloadHandler(retriever, client, msg, command, directory)

    return handler.handle()

@@ -130,20 +130,20 @@ class BasicResponseHandler(ResponseHandler):

 class DownloadHandler(ResponseHandler, ABC):

-    def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
+    def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, directory=None):
        super().__init__(retriever, client, msg, cmd)

-        if not dir:
-            dir = self._create_directory()
+        if not directory:
+            directory = self._create_directory()

-        self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
+        self.path = self._get_duplicate_name(os.path.join(directory, self.get_filename()))

    @staticmethod
-    def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
+    def create(retriever: Retriever, client: HTTPClient, msg, cmd, directory=None):
        content_type = msg.headers.get("content-type")
        if content_type and "text/html" in content_type:
-            return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
-        return RawDownloadHandler(retriever, client, msg, cmd, dir)
+            return HTMLDownloadHandler(retriever, client, msg, cmd, directory)
+        return RawDownloadHandler(retriever, client, msg, cmd, directory)

    def _create_directory(self):
        path = self._get_duplicate_name(os.path.abspath(self.client.host))
@@ -194,14 +194,14 @@ class RawDownloadHandler(DownloadHandler):


 class HTMLDownloadHandler(DownloadHandler):
-    def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
-        super().__init__(retriever, client, msg, cmd, dir)
+    def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, directory=None):
+        super().__init__(retriever, client, msg, cmd, directory)

    def handle(self) -> str:

-        (dir, file) = os.path.split(self.path)
+        (directory, file) = os.path.split(self.path)
        tmp_filename = f".{file}.tmp"
-        tmp_path = os.path.join(dir, tmp_filename)
+        tmp_path = os.path.join(directory, tmp_filename)
        file = open(tmp_path, "wb")

        for buffer in self.retriever.retrieve():
@@ -217,11 +217,11 @@ class HTMLDownloadHandler(DownloadHandler):
        with open(tmp_filename, "rb") as fp:
            soup = BeautifulSoup(fp, 'lxml')

-            base_url = parser.base_url(self.cmd.uri)
            base_element = soup.find("base")

+            base_url = self.cmd.uri
            if base_element:
-                base_url = f"http://{self.cmd.host}" + base_element["href"]
+                base_url = parser.urljoin(self.cmd.uri, base_element["href"])

            processed = {}
            tag: Tag
@@ -241,22 +241,18 @@ class HTMLDownloadHandler(DownloadHandler):
                    logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)

        with open(target_filename, 'w') as file:
-            file.write(str(soup))
+            file.write(soup.prettify(formatter="minimal"))

    def __download_image(self, img_src, base_url):
+        """
+        Download the image referenced by `img_src`, resolving it against `base_url`.
+        If the image is available, it is saved to the directory containing `self.path`.
+        """
+
        logging.info("Downloading image: %s", img_src)

        parsed = urlsplit(img_src)
-
-        if parsed.scheme not in ("", "http", "https"):
-            # Not a valid url
-            return None
-
-        if parsed.hostname is None:
-            if img_src[0] == "/":
-                img_src = f"http://{self.cmd.host}{img_src}"
-            else:
-                img_src = parser.absolute_url(base_url, img_src)
+        img_src = parser.urljoin(base_url, img_src)

        if parsed.hostname is None or parsed.hostname == self.cmd.host:
            port = self.cmd.port