This commit is contained in:
2021-03-24 16:35:12 +01:00
parent 9ba7a030a7
commit d14252f707
10 changed files with 325 additions and 185 deletions

View File

@@ -2,52 +2,57 @@ import logging
import os
import re
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse, unquote
from urllib.parse import urlsplit, unquote
import cssutils
from bs4 import BeautifulSoup, Tag
from client.command import AbstractCommand, GetCommand
from client.httpclient import HTTPClient, FORMAT
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.message import Message
from httplib.retriever import Retriever
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
    """Process one HTTP response and, on success, download its payload.

    First runs a BasicResponseHandler to deal with the status line
    (redirects, errors).  If that yields a body retriever, picks an
    HTML-aware or raw download handler based on the Content-Type header
    and returns that handler's result (the downloaded filename).
    """
    basic = BasicResponseHandler(client, msg, command)
    retriever = basic.handle()
    if retriever is None:
        # Status handling already consumed the response (error/redirect).
        return
    ctype = msg.headers.get("content-type")
    is_html = bool(ctype) and "text/html" in ctype
    downloader_cls = HTMLDownloadHandler if is_html else RawDownloadHandler
    downloader = downloader_cls(retriever, client, msg, command, dir)
    return downloader.handle()
class ResponseHandler(ABC):
client: HTTPClient
headers: Dict[str, str]
status_code: int
url: str
retriever: Retriever
msg: Message
cmd: AbstractCommand
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd):
self.client = client
self.headers = headers
self.url = url
self.retriever = retriever
pass
self.msg = msg
self.cmd = cmd
@abstractmethod
def handle(self):
pass
@staticmethod
def create(client: HTTPClient, headers, status_code, url):
retriever = Retriever.create(client, headers)
content_type = headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLDownloadHandler(retriever, client, headers, url)
return RawDownloadHandler(retriever, client, headers, url)
@staticmethod
def parse_uri(uri: str):
parsed = urlparse(uri)
parsed = urlsplit(uri)
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + uri)
parsed = urlsplit("//" + uri)
host = parsed.netloc
path = parsed.path
@@ -56,11 +61,79 @@ class ResponseHandler(ABC):
return host, path
class DownloadHandler(ResponseHandler, ABC):
path: str
class BasicResponseHandler(ResponseHandler):
""" Response handler which throws away the body and only shows the headers.
In case of a redirect, it will process it and pass it to the appropriate response handler.
"""
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url)
def __init__(self, client: HTTPClient, msg: Message, cmd: AbstractCommand):
retriever = Retriever.create(client, msg.headers)
super().__init__(retriever, client, msg, cmd)
def handle(self):
return self._handle_status()
def _skip_body(self):
logging.debug("Skipping body: [")
for line in self.retriever.retrieve():
try:
logging.debug("%s", line.decode(FORMAT))
except Exception:
logging.debug("%r", line)
logging.debug("] done.")
def _handle_status(self):
logging.info("%d %s", self.msg.status, self.msg.msg)
if self.msg.status == 101:
# Switching protocols is not supported
print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
print(self.msg.headers)
return
if 200 <= self.msg.status < 300:
return self.retriever
if 300 <= self.msg.status < 400:
# Redirect
return self._do_handle_redirect()
if 400 <= self.msg.status < 500:
# Dump headers and exit with error
print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
print(self.msg.headers)
return None
def _do_handle_redirect(self):
    """Follow a 3xx redirect by re-targeting the command and re-executing it.

    Raises:
        InvalidResponse: if the Location header is missing, has no
            hostname, or uses a scheme other than plain http.
    """
    # The redirect body is irrelevant, but it must be drained so the
    # connection can be reused for the follow-up request.
    self._skip_body()
    location = self.msg.headers.get("location")
    if not location:
        raise InvalidResponse("No location in redirect")
    parsed_location = urlsplit(location)
    if not parsed_location.hostname:
        raise InvalidResponse("Invalid location")
    if not parsed_location.scheme == "http":
        raise InvalidResponse("Only http is supported")
    # Re-point the originating command at the new location and rerun it.
    self.cmd.uri = location
    self.cmd.host, self.cmd.port, self.cmd.path = parser.parse_uri(location)
    if self.msg.status == 301:
        # NOTE(review): on a permanent redirect the connection is dropped —
        # presumably because the target host may differ; the re-executed
        # command reconnects. TODO confirm against cmd.execute().
        logging.info("Status 301. Closing socket [%s]", self.cmd.host)
        self.client.close()
    # NOTE(review): no redirect-count limit — a redirect loop would recurse
    # via cmd.execute() indefinitely.
    self.cmd.execute()
    return None
class DownloadHandler(ResponseHandler, ABC):
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
super().__init__(retriever, client, msg, cmd)
if not dir:
dir = self._create_directory()
@@ -68,11 +141,11 @@ class DownloadHandler(ResponseHandler, ABC):
self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
@staticmethod
def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
content_type = headers.get("content-type")
def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
content_type = msg.headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLDownloadHandler(retriever, client, headers, url, dir)
return RawDownloadHandler(retriever, client, headers, url, dir)
return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
return RawDownloadHandler(retriever, client, msg, cmd, dir)
def _create_directory(self):
path = self._get_duplicate_name(os.path.abspath(self.client.host))
@@ -91,54 +164,25 @@ class DownloadHandler(ResponseHandler, ABC):
def get_filename(self):
"""Returns the filename to download the payload to.
"""
filename = "index.html"
parsed = urlparse(self.url)
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + self.url)
# If the path contains a `/` get only the last part and use it as filename
# If the path end with a `/`, it's a directory so ignore it.
if len(parsed.path) != 0:
index = parsed.path.rfind("/")
if index == -1:
filename = parsed.path
elif parsed.path[-1] != "/":
filename = parsed.path[index:]
filename = os.path.basename(self.cmd.path)
if filename == '':
return "index.html"
while "%" in filename:
filename = unquote(filename)
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
result = os.path.basename(filename).strip()
if any(letter.isalnum() for letter in result):
return result
return "index.html"
def _handle_sub_request(self, client, url):
    """Read one HTTP response from *client* and save its payload.

    The file is written next to self.path via a RawDownloadHandler and
    the downloaded filename is returned.

    Raises:
        InvalidResponse: if the response status is anything but 200.
    """
    version, status, _ = parser.get_status_line(client)
    logging.debug("Parsed status-line: version: %s, status: %s", version, status)
    response_headers = parser.get_headers(client)
    logging.debug("Parsed headers: %r", response_headers)
    if status != 200:
        raise InvalidResponse("Status not expected 200: " + str(status))
    body_retriever = Retriever.create(client, response_headers)
    target_dir = os.path.dirname(self.path)
    sub_handler = RawDownloadHandler(body_retriever, client, response_headers, url, target_dir)
    return sub_handler.handle()
class RawDownloadHandler(DownloadHandler):
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url, dir)
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
super().__init__(retriever, client, msg, cmd, dir)
def handle(self) -> str:
logging.debug("Retrieving payload")
@@ -152,8 +196,8 @@ class RawDownloadHandler(DownloadHandler):
class HTMLDownloadHandler(DownloadHandler):
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url, dir)
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
super().__init__(retriever, client, msg, cmd, dir)
def handle(self) -> str:
@@ -172,11 +216,11 @@ class HTMLDownloadHandler(DownloadHandler):
def _download_images(self, tmp_filename, target_filename):
(host, path) = ResponseHandler.parse_uri(self.url)
(host, path) = ResponseHandler.parse_uri(self.cmd.uri)
with open(tmp_filename, "rb") as fp:
soup = BeautifulSoup(fp, 'lxml')
base_url = self.url
base_url = self.cmd.uri
base_element = soup.find("base")
if base_element:
@@ -186,58 +230,24 @@ class HTMLDownloadHandler(DownloadHandler):
tag: Tag
for tag in soup.find_all("img"):
try:
if tag.has_attr("src"):
el_name = "src"
elif tag.has_attr("data-src"):
el_name = "data-src"
else:
if not tag.has_attr("src"):
continue
if tag[el_name] in processed:
new_url = processed.get(tag[el_name])
if tag["src"] in processed:
new_url = processed.get(tag["src"])
else:
new_url = self.__download_image(tag[el_name], host, base_url)
processed[tag[el_name]] = new_url
new_url = self.__download_image(tag["src"], host, base_url)
processed[tag["src"]] = new_url
if new_url:
tag[el_name] = new_url
tag["src"] = new_url
except Exception as e:
logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
for tag in soup.find_all("div"):
if not tag.has_attr("style"):
continue
style = cssutils.parseStyle(tag["style"])
if "background" in style and "url(" in style["background"]:
el_name = "background"
elif "background-image" in style and "url(" in style["background-image"]:
el_name = "background-image"
else:
continue
el = style[el_name]
start = el.find("url(") + 4
end = el.find(")", start)
url = el[start:end].strip()
try:
if url in processed:
new_url = url
else:
new_url = self.__download_image(url, host, base_url)
processed[url] = new_url
if new_url:
el = el[:start] + new_url + el[end:]
style[el_name] = el
tag["style"] = style.cssText
except Exception as e:
logging.debug("Internal error", exc_info=e)
logging.error("Failed to download image: %s, skipping...", tag["src"])
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
with open(target_filename, 'w') as file:
file.write(str(soup))
def __download_image(self, img_src, host, base_url):
parsed = urlparse(img_src)
parsed = urlsplit(img_src)
logging.debug("Downloading image: %s", img_src)
@@ -245,36 +255,18 @@ class HTMLDownloadHandler(DownloadHandler):
# Not a valid url
return None
if parsed.hostname == host:
port = self.cmd.port
elif ":" in parsed.netloc:
port = parsed.netloc.split(":", 1)[1]
else:
port = 80
if len(parsed.netloc) == 0 and parsed.path != "/":
# relative url, append base_url
img_src = os.path.join(os.path.dirname(base_url), parsed.path)
parsed = urlparse(img_src)
command = GetCommand(img_src, port, os.path.dirname(self.path))
command.execute(True)
# Check if the image is located on the same server
if len(parsed.netloc) == 0 or parsed.netloc == host:
same_host = True
img_host = host
img_path = parsed.path
else:
same_host = False
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
message = f"GET {img_path} HTTP/1.1\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += f"Host: {img_host}\r\n\r\n"
message = message.encode(FORMAT)
if same_host:
client = self.client
client.reset_request()
else:
client = HTTPClient(img_src)
client.conn.connect((img_host, 80))
client.conn.sendall(message)
filename = self._handle_sub_request(client, img_host + img_path)
if not same_host:
client.close()
return filename
return command.filename