CN2021/client/ResponseHandler.py

import logging
import os
import re
from typing import Dict
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from client.Retriever import Retriever
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine


def handle(client: HTTPClient, url: str):
    logging.debug("Waiting for response")

    try:
        (version, status, _) = get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
        headers = get_headers(client)
        logging.debug("Parsed headers: %r", headers)

        response_handler = construct(client, headers, status, url)
        response_handler.handle()

    except InvalidResponse as e:
        logging.debug("Internal error: Response could not be parsed", exc_info=e)
        return
    except InvalidStatusLine as e:
        logging.debug("Internal error: Invalid status-line in response", exc_info=e)
        return
    except UnsupportedEncoding as e:
        logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
        return


def get_status_line(client: HTTPClient):
    line = client.read_line()

    split = list(filter(None, line.split(" ")))
    if len(split) < 3:
        raise InvalidStatusLine(line)

    # Check HTTP version
    http_version = split.pop(0)
    if len(http_version) < 8 or http_version[4] != "/":
        raise InvalidStatusLine(line)

    (name, version) = http_version[:4], http_version[5:]
    if name != "HTTP" or not re.match(r"1\.[0|1]", version):
        raise InvalidStatusLine(line)

    status = split.pop(0)
    if not re.match(r"\d{3}", status):
        raise InvalidStatusLine(line)
    status = int(status)
    if status < 100 or status > 999:
        raise InvalidStatusLine(line)

    reason = split.pop(0)
    return version, status, reason


def get_headers(client: HTTPClient):
    headers = []
    # first header after the status-line may not contain a space
    while True:
        line = client.read_line()
        if line[0].isspace():
            continue
        else:
            break

    while True:
        if line in ("\r\n", "\n", " "):
            break

        if line[0].isspace():
            headers[-1] = headers[-1].rstrip("\r\n")

        headers.append(line.lstrip())
        line = client.read_line()

    result = {}
    header_str = "".join(headers)
    for line in header_str.splitlines():
        pos = line.find(":")

        if pos <= 0 or pos >= len(line) - 1:
            continue

        (header, value) = map(str.strip, line.split(":", 1))
        check_next_header(result, header, value)
        result[header.lower()] = value.lower()

    return result


def check_next_header(headers, next_header: str, next_value: str):
    if next_header == "content-length":
        if "content-length" in headers:
            logging.error("Multiple content-length headers specified")
            raise InvalidResponse()
        if not next_value.isnumeric() or int(next_value) <= 0:
            logging.error("Invalid content-length value: %r", next_value)
            raise InvalidResponse()


def construct(client: HTTPClient, headers, status_code, url):
    # only chunked transfer-encoding is supported
    transfer_encoding = headers.get("transfer-encoding")
    if transfer_encoding and transfer_encoding != "chunked":
        raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
    chunked = transfer_encoding

    # content-encoding is not supported
    content_encoding = headers.get("content-encoding")
    if content_encoding:
        raise UnsupportedEncoding("content-encoding", content_encoding)

    retriever = Retriever.create(client, headers)

    content_type = headers.get("content-type")
    if content_type and "text/html" in content_type:
        return HTMLDownloadHandler(retriever, client, headers, url)
    return RawDownloadHandler(retriever, client, headers, url)


def parse_uri(uri: str):
    parsed = urlparse(uri)

    # If there is no netloc, the url is invalid, so prepend `//` and try again
    if parsed.netloc == "":
        parsed = urlparse("//" + uri)

    host = parsed.netloc
    path = parsed.path
    if len(path) == 0 or path[0] != '/':
        path = "/" + path
    return host, path


class ResponseHandler:
    client: HTTPClient
    headers: Dict[str, str]
    status_code: int
    url: str
    retriever: Retriever

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
        self.client = client
        self.headers = headers
        self.url = url
        self.retriever = retriever
        pass

    def handle(self):
        pass


class DownloadHandler(ResponseHandler):
    path: str

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url)

        if not dir:
            dir = self._create_directory()

        self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))

    @staticmethod
    def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        content_type = headers.get("content-type")
        if content_type and "text/html" in content_type:
            return HTMLDownloadHandler(retriever, client, headers, url, dir)
        return RawDownloadHandler(retriever, client, headers, url, dir)

    def handle(self) -> str:
        pass

    def _create_directory(self):
        path = self._get_duplicate_name(os.path.abspath(self.client.host))
        os.mkdir(path)
        return path

    def _get_duplicate_name(self, path):
        tmp_path = path
        i = 0
        while os.path.exists(tmp_path):
            i += 1
            tmp_path = "{path}.{counter}".format(path=path, counter=i)

        return tmp_path

    def get_filename(self):
        """Returns the filename to download the payload to.
        """
        filename = "index.html"

        parsed = urlparse(self.url)

        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlparse("//" + self.url)

        # If the path contains a `/` get only the last part and use it as filename
        # If the path end with a `/`, it's a directory so ignore it.
        if len(parsed.path) != 0:
            index = parsed.path.rfind("/")
            if index == -1:
                filename = parsed.path
            elif parsed.path[-1] != "/":
                filename = parsed.path[index:]

        result = os.path.basename(filename).strip()
        if any(letter.isalnum() for letter in result):
            return result

        return "index.html"

    def _handle_sub_request(self, client, url):

        (version, status, _) = get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
        headers = get_headers(client)
        logging.debug("Parsed headers: %r", headers)

        if status != 200:
            raise InvalidResponse("Status not expected 200: " + str(status))

        retriever = Retriever.create(client, headers)
        handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))

        return handler.handle()


class RawDownloadHandler(DownloadHandler):

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url, dir)

    def handle(self) -> str:
        logging.debug("Retrieving payload")
        file = open(self.path, "wb")

        for buffer in self.retriever.retrieve():
            file.write(buffer)
        file.close()

        return self.path


class HTMLDownloadHandler(DownloadHandler):
    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url, dir)

    def handle(self) -> str:

        (dir, file) = os.path.split(self.path)
        tmp_filename = ".{file}.tmp".format(file=file)
        tmp_path = os.path.join(dir, tmp_filename)
        file = open(tmp_path, "wb")

        for buffer in self.retriever.retrieve():
            file.write(buffer)
        file.close()

        self.__download_images(tmp_path, self.path)
        os.remove(tmp_path)
        return self.path

    def __download_images(self, tmp_filename, target_filename):

        (host, path) = parse_uri(self.url)
        with open(tmp_filename, "rb") as fp:
            soup = BeautifulSoup(fp, 'html.parser')

            for tag in soup.find_all("img"):
                try:
                    tag["src"] = self.__download_image(tag["src"], host, path)
                except Exception as e:
                    logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)

        with open(target_filename, 'w') as file:
            file.write(str(soup))

    def __download_image(self, img_src, host, path):
        parsed = urlparse(img_src)

        logging.debug("Downloading image: %s", img_src)

        same_host = True
        if len(parsed.netloc) == 0 or parsed.netloc == host:
            img_host = host
            if parsed.path[0] != "/":
                base = os.path.split(path)[0]
                if base[-1] != '/':
                    base += "/"
                img_path = base + parsed.path
            else:
                img_path = parsed.path
        else:
            same_host = False
            (img_host, img_path) = parse_uri(img_src)

        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
        message += "Host: {host}\r\n\r\n".format(host=host)
        message = message.encode(FORMAT)

        if same_host:
            client = self.client
            client.reset_request()
        else:
            client = HTTPClient(img_src)
            client.connect((img_host, 80))
        client.sendall(message)
        filename = self._handle_sub_request(client, img_host + img_path)

        if not same_host:
            client.close()

        return filename