Client update

2021-03-19 03:29:35 +01:00
parent 1966a174bb
commit 797cdb0c0e
4 changed files with 425 additions and 37 deletions
--- a/client/ResponseHandler.py
+++ b/client/ResponseHandler.py
@@ -1,26 +1,110 @@
+import logging
 import os
-from socket import socket
 from typing import Dict
 from urllib.parse import urlparse

+from bs4 import BeautifulSoup
+
+from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
+
+
+def handle(client: HTTPClient, url: str):
+    """Receive a response on `client` and dispatch its body to a handler.
+
+    The first received buffer is split into the header chunk and the rest.
+    The status line is validated, and the handler chosen by `construct` is
+    given the rest of the buffer. A timeout, an unparsable response, an
+    invalid status line or an unsupported encoding is reported on stdout with
+    an `[ABRT]` message. Nothing is returned.
+    """
+    logging.debug("Waiting for response")
+    try:
+        received = client.receive()
+    except TimeoutError:
+        print("[ABRT] Response timed out")
+        return
+
+    try:
+        header_chunk, remainder = client.get_crlf_chunk(received)
+        status_line, headers = client.parse_headers(header_chunk)
+        client.validate_status_line(status_line)
+
+        # The status code is the second space-separated field of the status line.
+        code = int(status_line.split(" ")[1])
+        construct(client, headers, code, url).handle(remainder)
+    except InvalidResponse as err:
+        logging.debug("Internal error: Response could not be parsed", exc_info=err)
+        print("[ABRT] Invalid response")
+    except InvalidStatusLine as err:
+        logging.debug("Internal error: Invalid status-line in response", exc_info=err)
+        print("[ABRT] Invalid response")
+    except UnsupportedEncoding as err:
+        logging.debug("Internal error: Unsupported encoding in response", exc_info=err)
+        print("[ABRT] Invalid response")
+
+
+def construct(client: HTTPClient, headers, status_code, url):
+    """Pick the response handler that matches the response headers.
+
+    `headers` is looked up with lower-case keys (`transfer-encoding`,
+    `content-encoding`, `content-type`). Header values are compared
+    case-insensitively and with surrounding whitespace ignored.
+
+    Returns a `ChunkedResponseHandler` for chunked responses, an
+    `HTMLResponseHandler` when the content type contains `text/html`, and a
+    `PlainResponseHandler` otherwise.
+
+    Raises `UnsupportedEncoding` for any transfer-encoding other than
+    `chunked` and for any content-encoding other than `identity`.
+    """
+    # only chunked transfer-encoding is supported
+    transfer_encoding = headers.get("transfer-encoding")
+    normalized_transfer = transfer_encoding.strip().lower() if transfer_encoding else ""
+    if normalized_transfer and normalized_transfer != "chunked":
+        raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
+
+    # content-encoding is not supported; `identity` means "no encoding", so it is harmless
+    content_encoding = headers.get("content-encoding")
+    if content_encoding and content_encoding.strip().lower() != "identity":
+        raise UnsupportedEncoding("content-encoding", content_encoding)
+
+    if normalized_transfer:
+        return ChunkedResponseHandler(client, headers, status_code, url)
+
+    content_type = headers.get("content-type")
+    if content_type and "text/html" in content_type.lower():
+        return HTMLResponseHandler(client, headers, status_code, url)
+    return PlainResponseHandler(client, headers, status_code, url)
+
+
+def parse_uri(uri: str):
+    """Split `uri` into a `(host, path)` tuple.
+
+    The host is the network location reported by `urlparse`. The returned
+    path always starts with `/`; an empty path becomes `/`.
+    """
+    parts = urlparse(uri)
+    if not parts.netloc:
+        # No netloc was recognised, so prefix `//` to make urlparse treat
+        # the leading component as the host and parse again.
+        parts = urlparse("//" + uri)
+
+    path = parts.path
+    if not path.startswith("/"):
+        path = "/" + path
+    return parts.netloc, path
+

 class ResponseHandler:
-    client: socket
-    url: str
+    client: HTTPClient
    headers: Dict[str, str]
+    status_code: int
+    url: str

-    def __init__(self, url: str, client: socket):
-        self.headers = {}
-        self.url = url
+    def __init__(self, client: HTTPClient, headers: Dict[str, str], status_code: int, url: str):
+        """Store the client, the parsed headers, the status code and the url.
+
+        The values are kept as attributes of the same names; nothing is read
+        from the client here.
+        """
        self.client = client
+        self.headers = headers
+        self.status_code = status_code
+        self.url = url
        pass

-    def get_html_filename(self):
+    def handle(self, buffer: bytes):
+        """Process the response body; this base implementation does nothing.
+
+        `buffer` holds the bytes already received after the headers. The
+        subclasses in this file override this method.
+        """
+        return None
+
+    def get_filename(self):
+        """Returns the filename to download the payload to.
+        """
        filename = "index.html"

        parsed = urlparse(self.url)
+
+        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlparse("//" + self.url)
+
+        # If the path contains a `/` get only the last part and use it as filename
+        # If the path ends with a `/`, it's a directory, so ignore it.
        if len(parsed.path) != 0:
            index = parsed.path.rfind("/")
            if index == -1:
@@ -29,14 +113,179 @@ class ResponseHandler:
                filename = parsed.path[index:]

        result = os.path.basename(filename).strip()
-        return result
+        if any(letter.isalnum() for letter in result):
+            return result
+
+        return "index.html"
+
+    def _handle_download(self, client, url):
+        """Receive one response on `client` and store its body via a handler.
+
+        Only a `200` status is accepted; any other code is treated as an
+        invalid response. Returns whatever the selected handler's `handle`
+        returns (the handlers in this file return the written filename or
+        `None`). Returns `None` after printing an `[ABRT]` message when the
+        response times out, cannot be parsed, or uses an unsupported encoding.
+        """
+        logging.debug("Waiting for response")
+        try:
+            received = client.receive()
+        except TimeoutError:
+            print("[ABRT] Response timed out")
+            return
+
+        try:
+            header_chunk, remainder = client.get_crlf_chunk(received)
+            status_line, headers = client.parse_headers(header_chunk)
+            client.validate_status_line(status_line)
+
+            # The status code is the second space-separated field of the status line.
+            code = int(status_line.split(" ")[1])
+            if code != 200:
+                raise InvalidResponse("Code not 200")
+
+            return construct(client, headers, code, url).handle(remainder)
+        except InvalidResponse as err:
+            logging.debug("Internal error: Response could not be parsed", exc_info=err)
+            print("[ABRT] Invalid response")
+        except InvalidStatusLine as err:
+            logging.debug("Internal error: Invalid status-line in response", exc_info=err)
+            print("[ABRT] Invalid response")
+        except UnsupportedEncoding as err:
+            logging.debug("Internal error: Unsupported encoding in response", exc_info=err)
+            print("[ABRT] Invalid response")


 class PlainResponseHandler(ResponseHandler):
-    def __init__(self, url: str, client: socket):
-        super().__init__(url, client)
+    def __init__(self, client: HTTPClient, headers, status_code, url):
+        """Forward all arguments unchanged to the `ResponseHandler` initialiser."""
+        super().__init__(client=client, headers=headers, status_code=status_code, url=url)
+
+    def _get_payload_size(self):
+        """Return the expected payload size derived from `content-length`.
+
+        Returns `None` when the header is `0` (there is no body), `-1` when
+        the header is missing or empty, and the header's integer value
+        otherwise.
+        """
+        length = self.__get_content_length()
+        if length == 0:
+            logging.debug("content-length is 0")
+            return None
+
+        if not length:
+            # Only a missing header reaches this point; zero was handled above.
+            logging.debug("No content-length specified")
+            return -1
+
+        logging.debug("Expected content-length=%s", length)
+        return length
+
+    def handle(self, buffer: bytes):
+        """Write the response body to the file named by `get_filename`.
+
+        `buffer` holds the body bytes already received after the headers; any
+        remaining bytes are fetched by `_retrieve`.
+
+        Returns the filename written to, or `None` when `_get_payload_size`
+        reports that there is no body.
+        """
+        payload_size = self._get_payload_size()
+        if payload_size is None:
+            return None
+
+        logging.debug("Retrieving payload")
+        filename = self.get_filename()
+        # The context manager closes the file even if receiving the payload fails.
+        with open(filename, "wb") as file:
+            self._retrieve(file, buffer, payload_size)
+
+        return filename
+
+    def _retrieve(self, file, buffer: bytes, payload_size: int):
+        """Write `buffer` to `file`, then keep receiving until the body is complete.
+
+        With a non-negative `payload_size`, receiving stops once that many
+        bytes have been written or the client returns an empty buffer. A
+        negative `payload_size` means the length is unknown: receiving then
+        continues until the client returns an empty buffer or times out.
+        """
+        file.write(buffer)
+        received = len(buffer)
+
+        # Without a content-length the body is delimited by the end of the
+        # connection, so the byte count cannot be used as the stop condition.
+        until_close = payload_size < 0
+
+        while until_close or received < payload_size:
+            try:
+                buffer = self.client.receive()
+            except TimeoutError:
+                if not until_close:
+                    raise
+                # With an unknown length, a peer that keeps the connection
+                # open never signals the end, so a timeout ends the payload.
+                logging.debug("Timed out waiting for more payload, assuming it is complete")
+                break
+            logging.debug("Received payload length: %s", len(buffer))
+
+            if len(buffer) == 0:
+                if not until_close:
+                    logging.warning("Received payload length %s less than expected %s", received, payload_size)
+                break
+
+            received += len(buffer)
+            logging.debug("Processed payload: %r", received)
+            file.write(buffer)
+
+    def __get_content_length(self):
+        """Return the `content-length` header as an int.
+
+        Returns `None` when the header is missing or empty.
+
+        Raises `InvalidResponse` when the value is not an integer or is
+        negative, so the failure is reported like any other malformed
+        response instead of escaping as a `ValueError`.
+        """
+        raw_length = self.headers.get("content-length")
+        if not raw_length:
+            return None
+
+        try:
+            content_length = int(raw_length)
+        except ValueError as err:
+            raise InvalidResponse("Invalid content-length") from err
+        if content_length < 0:
+            raise InvalidResponse("Invalid content-length")
+
+        return content_length
+
+
+class HTMLResponseHandler(PlainResponseHandler):
+    """Handler for HTML bodies that also downloads the images the page uses.
+
+    The page is first stored in a temporary file. Every `<img>` whose image
+    could be downloaded gets its `src` rewritten to the downloaded filename,
+    and the rewritten page is then written to the final file.
+    """
+
+    def __init__(self, client: HTTPClient, headers, status_code, url):
+        super().__init__(client, headers, status_code, url)
+
+    def handle(self, buffer: bytes):
+        """Store the page, download its images and rewrite the image links.
+
+        Returns the filename of the rewritten page, or `None` when
+        `_get_payload_size` reports that there is no body.
+        """
+        payload_size = self._get_payload_size()
+        if payload_size is None:
+            return None
+
+        logging.debug("Retrieving payload")
+        filename = self.get_filename()
+        tmp_filename = "." + filename + ".tmp"
+        try:
+            with open(tmp_filename, "wb") as file:
+                self._retrieve(file, buffer, payload_size)
+            self.__download_images(tmp_filename, filename)
+        finally:
+            # Never leave the temporary copy behind, even when a step above fails.
+            if os.path.exists(tmp_filename):
+                os.remove(tmp_filename)
+        return filename
+
+    def __download_images(self, tmp_filename, target_filename):
+        """Download every image of the page in `tmp_filename` and write the result.
+
+        An image that cannot be downloaded keeps its original `src`.
+        """
+        (host, path) = parse_uri(self.url)
+
+        # Pass bytes so the parser determines the document encoding itself
+        # instead of decoding with the platform default.
+        with open(tmp_filename, "rb") as fp:
+            soup = BeautifulSoup(fp, "lxml")
+
+        for tag in soup.find_all("img"):
+            src = tag.get("src")
+            if not src:
+                # Nothing to download for an image without a source.
+                continue
+            try:
+                image_filename = self.__download_image(src, host, path)
+            except Exception as e:
+                # Best effort: one broken image must not abort the whole page.
+                logging.error("Failed to download image, skipping...", exc_info=e)
+                continue
+            # Only point the tag at a local file when a file was actually written.
+            if image_filename:
+                tag["src"] = image_filename
+
+        # Written as UTF-8 explicitly rather than in the platform default encoding.
+        with open(target_filename, "w", encoding="utf-8") as file:
+            file.write(str(soup))
+
+    def __download_image(self, img_src, host, path):
+        """Request the image at `img_src` and return the filename it was saved to.
+
+        `host` and `path` are the host and path of the page and are used to
+        resolve relative sources. Images on the page's host reuse
+        `self.client`; images on another host use a new client on port 80,
+        which is always closed again.
+
+        Returns `None` when the source is not an HTTP(S) or relative reference
+        or when the download failed.
+        """
+        parsed = urlparse(img_src)
+
+        # `data:` and similar URIs cannot be fetched over HTTP.
+        if parsed.scheme and parsed.scheme not in ("http", "https"):
+            logging.debug("Skipping image with unsupported scheme %s", parsed.scheme)
+            return None
+
+        same_host = len(parsed.netloc) == 0 or parsed.netloc == host
+        if same_host:
+            img_host = host
+            if parsed.path.startswith("/"):
+                img_path = parsed.path
+            else:
+                # Relative source: resolve it against the directory of the page.
+                base = os.path.split(path)[0]
+                if not base.endswith("/"):
+                    base += "/"
+                img_path = base + parsed.path
+        else:
+            (img_host, img_path) = parse_uri(img_src)
+
+        # The query string belongs to the request target but not to the filename.
+        target = img_path
+        if parsed.query:
+            target += "?" + parsed.query
+
+        message = "GET {path} HTTP/1.1\r\n".format(path=target)
+        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
+        # The Host header must name the server that actually receives the request.
+        message += "Host: {host}\r\n\r\n".format(host=img_host)
+        message = message.encode(FORMAT)
+
+        if same_host:
+            self.client.sendall(message)
+            return self._handle_download(self.client, img_host + img_path)
+
+        client = HTTPClient(img_src)
+        try:
+            client.connect((img_host, 80))
+            client.sendall(message)
+            return self._handle_download(client, img_host + img_path)
+        finally:
+            client.close()


 class ChunkedResponseHandler(ResponseHandler):
-    def __init__(self, url: str, client: socket):
-        super().__init__(url, client)
+    def __init__(self, client: HTTPClient, headers, status_code, url):
+        """Forward all arguments unchanged to the `ResponseHandler` initialiser."""
+        super().__init__(client=client, headers=headers, status_code=status_code, url=url)
+
+    def handle(self, buffer: bytes):
+        """Placeholder: `buffer` is ignored and `None` is returned.
+
+        NOTE: the chunked body is neither decoded nor stored here.
+        """
+        return