update

2021-03-21 23:01:09 +01:00
parent 638576f471
commit d25d2ef993
14 changed files with 681 additions and 226 deletions
--- a/client/response_handler.py
+++ b/client/response_handler.py
@@ -0,0 +1,225 @@
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import Dict
+from urllib.parse import urljoin, urlparse
+
+from bs4 import BeautifulSoup
+
+from client.httpclient import HTTPClient, FORMAT
+from httplib.retriever import Retriever
+from httplib import parser
+from httplib.exceptions import InvalidResponse
+
+
+class ResponseHandler(ABC):
+    """Base class for objects that process the response to a request.
+
+    An instance keeps the client, the response headers, the requested URL
+    and the retriever it was constructed with; subclasses implement
+    `handle` to decide what happens with the response.
+    """
+    client: HTTPClient
+    headers: Dict[str, str]
+    status_code: int
+    url: str
+    retriever: Retriever
+
+    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
+        """Store the collaborators needed to process one response."""
+        self.retriever = retriever
+        self.url = url
+        self.headers = headers
+        self.client = client
+
+    @abstractmethod
+    def handle(self):
+        """Process the response; concrete handlers define how."""
+
+    @staticmethod
+    def create(client: HTTPClient, headers, status_code, url):
+        """Build the handler matching the response's content type.
+
+        An `HTMLDownloadHandler` is returned when the ``content-type``
+        header contains ``text/html``; any other (or missing) content type
+        yields a `RawDownloadHandler`. `status_code` is accepted but does
+        not influence the choice.
+        """
+        retriever = Retriever.create(client, headers)
+
+        media_type = headers.get("content-type")
+        if not media_type or "text/html" not in media_type:
+            return RawDownloadHandler(retriever, client, headers, url)
+        return HTMLDownloadHandler(retriever, client, headers, url)
+
+    @staticmethod
+    def parse_uri(uri: str):
+        """Split `uri` into a ``(host, path)`` tuple.
+
+        A URI for which no network location is found is parsed a second
+        time with ``//`` prepended, so that its leading component is read
+        as the host. The returned path always starts with ``/``.
+        """
+        parts = urlparse(uri)
+
+        # Without a netloc the leading component was taken as part of the
+        # path; the `//` prefix makes urlparse treat it as the host instead.
+        if parts.netloc == "":
+            parts = urlparse("//" + uri)
+
+        resource = parts.path
+        if not resource.startswith("/"):
+            resource = "/" + resource
+        return parts.netloc, resource
+
+
+class DownloadHandler(ResponseHandler, ABC):
+    """Response handler that stores the payload in a file on disk.
+
+    The target file is chosen at construction time and exposed as `path`.
+    Existing files and directories are never reused: a numeric suffix is
+    appended until an unused name is found.
+    """
+    path: str
+
+    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
+        """Resolve the file the payload will be written to.
+
+        When `dir` is falsy, a new directory named after ``client.host`` is
+        created and used as the target directory.
+        """
+        super().__init__(retriever, client, headers, url)
+
+        target_dir = dir if dir else self._create_directory()
+        wanted = os.path.join(target_dir, self.get_filename())
+        self.path = self._get_duplicate_name(wanted)
+
+    @staticmethod
+    def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
+        """Build the download handler matching the response's content type.
+
+        ``text/html`` responses get an `HTMLDownloadHandler`, everything
+        else a `RawDownloadHandler`; both save into `dir`.
+        """
+        media_type = headers.get("content-type")
+        if not media_type or "text/html" not in media_type:
+            return RawDownloadHandler(retriever, client, headers, url, dir)
+        return HTMLDownloadHandler(retriever, client, headers, url, dir)
+
+    def _create_directory(self):
+        """Create a not yet existing directory named after the client's host.
+
+        Returns the absolute path of the created directory.
+        """
+        new_dir = self._get_duplicate_name(os.path.abspath(self.client.host))
+        os.mkdir(new_dir)
+        return new_dir
+
+    def _get_duplicate_name(self, path):
+        """Return `path`, or ``path.N`` for the first N >= 1 that is unused."""
+        candidate = path
+        suffix = 0
+        while os.path.exists(candidate):
+            suffix += 1
+            candidate = f"{path}.{suffix}"
+
+        return candidate
+
+    def get_filename(self):
+        """Returns the filename to download the payload to.
+
+        The name is the last component of the URL path. ``index.html`` is
+        used when the path is empty, ends with a ``/`` or its last
+        component contains no alphanumeric character.
+        """
+        parts = urlparse(self.url)
+
+        # Without a netloc the host was read as part of the path, so parse
+        # again with `//` prepended.
+        if parts.netloc == "":
+            parts = urlparse("//" + self.url)
+
+        url_path = parts.path
+        candidate = "index.html"
+        if url_path:
+            last_slash = url_path.rfind("/")
+            if last_slash == -1:
+                # No `/` at all: the whole path is the filename.
+                candidate = url_path
+            elif not url_path.endswith("/"):
+                # A trailing `/` denotes a directory and keeps the default.
+                candidate = url_path[last_slash:]
+
+        name = os.path.basename(candidate).strip()
+        has_alnum = any(char.isalnum() for char in name)
+        return name if has_alnum else "index.html"
+
+    def _handle_sub_request(self, client, url):
+        """Read the response of an already sent request and save its payload.
+
+        The payload is stored next to this handler's own file by a
+        `RawDownloadHandler`. Returns whatever that handler's `handle`
+        returns.
+
+        Raises:
+            InvalidResponse: if the response status is not 200.
+        """
+        version, status, _reason = parser.get_status_line(client)
+        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
+        headers = parser.get_headers(client)
+        logging.debug("Parsed headers: %r", headers)
+
+        if status != 200:
+            raise InvalidResponse("Status not expected 200: " + str(status))
+
+        retriever = Retriever.create(client, headers)
+        target_dir = os.path.dirname(self.path)
+        sub_handler = RawDownloadHandler(retriever, client, headers, url, target_dir)
+
+        return sub_handler.handle()
+
+
+class RawDownloadHandler(DownloadHandler):
+    """Download handler that stores the payload unmodified."""
+
+    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
+        super().__init__(retriever, client, headers, url, dir)
+
+    def handle(self) -> str:
+        """Write every buffer produced by the retriever to `self.path`.
+
+        Returns:
+            The path of the written file.
+        """
+        logging.debug("Retrieving payload")
+
+        # The context manager closes the file even when retrieval fails
+        # halfway through, so no file handle is leaked.
+        with open(self.path, "wb") as file:
+            for buffer in self.retriever.retrieve():
+                file.write(buffer)
+
+        return self.path
+
+
+class HTMLDownloadHandler(DownloadHandler):
+    """Download handler for HTML pages that also fetches their images.
+
+    The page is first stored in a hidden temporary file. Every ``<img>`` it
+    references is then downloaded next to the page, and the page is written
+    to `path` with the ``src`` attributes pointing at the downloaded files.
+    """
+
+    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
+        super().__init__(retriever, client, headers, url, dir)
+
+    def handle(self) -> str:
+        """Download the page and its images.
+
+        Returns:
+            The path of the written HTML file.
+        """
+        directory, filename = os.path.split(self.path)
+        tmp_path = os.path.join(directory, ".{file}.tmp".format(file=filename))
+
+        try:
+            with open(tmp_path, "wb") as tmp_file:
+                for buffer in self.retriever.retrieve():
+                    tmp_file.write(buffer)
+
+            self.__download_images(tmp_path, self.path)
+        finally:
+            # Never leave the temporary file behind, not even when the
+            # retrieval or the rewriting of the page failed.
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+
+        return self.path
+
+    def __download_images(self, tmp_filename, target_filename):
+        """Download all images of the page and write the rewritten page.
+
+        Args:
+            tmp_filename: file holding the HTML as received.
+            target_filename: file the rewritten HTML is written to.
+
+        An image that cannot be downloaded is logged and skipped; its
+        ``src`` is left untouched.
+        """
+        host, _ = ResponseHandler.parse_uri(self.url)
+
+        with open(tmp_filename, "rb") as fp:
+            soup = BeautifulSoup(fp, 'html.parser')
+
+        # Give a scheme-less page url a network location (the same
+        # normalisation `parse_uri` applies) so that relative references
+        # can be resolved against it.
+        page_url = self.url
+        if urlparse(page_url).netloc == "":
+            page_url = "//" + page_url
+
+        # A <base href> overrides the url relative references resolve
+        # against; it may be relative to the page url itself. A <base>
+        # without href (e.g. only `target`) is ignored.
+        base_url = page_url
+        base_element = soup.find("base")
+        base_href = base_element.get("href") if base_element is not None else None
+        if base_href:
+            base_url = urljoin(page_url, base_href)
+
+        for tag in soup.find_all("img"):
+            src = tag.get("src")
+            if not src:
+                logging.debug("Skipping <img> without a src attribute")
+                continue
+
+            try:
+                tag["src"] = self.__download_image(src, host, base_url)
+            except Exception as e:
+                # Deliberately best effort: one broken image must not abort
+                # the download of the page.
+                logging.debug(e)
+                logging.error("Failed to download image: %s, skipping...", src)
+
+        # str(soup) declares utf-8 in a charset <meta>, so the file is
+        # written as utf-8 instead of the platform's default encoding.
+        with open(target_filename, 'w', encoding="utf-8") as file:
+            file.write(str(soup))
+
+    def __download_image(self, img_src, host, base_url):
+        """Download a single image and return the path it was saved to.
+
+        Args:
+            img_src: value of the ``src`` attribute, absolute or relative.
+            host: network location of the page, reachable via `self.client`.
+            base_url: url relative references are resolved against.
+
+        Raises:
+            ValueError: if the image url uses a scheme other than http(s).
+            InvalidResponse: if the server does not answer with status 200.
+        """
+        logging.debug("Downloading image: %s", img_src)
+
+        # urljoin implements url reference resolution; os.path functions
+        # are OS dependent and do not understand urls.
+        parsed = urlparse(urljoin(base_url, img_src))
+        if parsed.scheme not in ("", "http", "https"):
+            # e.g. `data:` uris, nothing to request for those
+            raise ValueError("Unsupported image URL scheme: " + parsed.scheme)
+
+        # Check if the image is located on the same server
+        same_host = parsed.netloc in ("", host)
+        img_host = host if same_host else parsed.netloc
+
+        img_path = parsed.path
+        if not img_path.startswith("/"):
+            img_path = "/" + img_path
+
+        # The query string is part of the request target.
+        target = img_path
+        if parsed.query:
+            target += "?" + parsed.query
+
+        message = "GET {path} HTTP/1.1\r\n".format(path=target)
+        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
+        # The Host header must name the server the request is sent to.
+        message += "Host: {host}\r\n\r\n".format(host=img_host)
+        message = message.encode(FORMAT)
+
+        if same_host:
+            # Reuse the connection the page itself was retrieved over.
+            client = self.client
+            client.reset_request()
+            client.conn.sendall(message)
+            return self._handle_sub_request(client, img_host + img_path)
+
+        client = HTTPClient(img_src)
+        try:
+            # Connect to the bare hostname; honour an explicit port.
+            client.conn.connect((parsed.hostname, parsed.port or 80))
+            client.conn.sendall(message)
+            return self._handle_sub_request(client, img_host + img_path)
+        finally:
+            # Close the extra connection on failure as well.
+            client.close()