# CN2021/client/response_handler.py

import logging
import os
import re
from abc import ABC, abstractmethod
from urllib.parse import urlsplit, unquote

from client.command import AbstractCommand, GetCommand
from client.httpclient import HTTPClient
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.httpsocket import FORMAT
from httplib.message import ClientMessage as Message
from httplib.retriever import Retriever

# Matches an HTML `<base ...>` tag carrying an `href` attribute; group 1 is the
# quoted href value (single or double quotes, at least one character).
# Matching is case-insensitive.
BASE_REGEX = re.compile(r"<\s*base[^>]*\shref\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)
# Matches an HTML `<img ...>` tag carrying a `src` attribute; group 1 is the
# quoted src value. Used to locate the images referenced by a downloaded page.
IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)


def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
    """Process a response and, when it carries a downloadable body, save it.

    The status line is first evaluated by a `BasicResponseHandler`. If that
    step yields no retriever (it returned `None`), nothing is downloaded and
    `None` is returned. Otherwise the body is handed to an
    `HTMLDownloadHandler` when the `content-type` header contains
    ``text/html`` and to a `RawDownloadHandler` in every other case.

    :param client: the client the response was received on
    :param msg: the parsed response message
    :param command: the command that produced the request
    :param directory: directory to download into; passed through to the
        download handler unchanged
    :return: whatever the selected download handler's `handle()` returns,
        or `None` when there is nothing to download
    """
    body_retriever = BasicResponseHandler(client, msg, command).handle()
    if body_retriever is None:
        return None

    content_type = msg.headers.get("content-type")
    is_html = bool(content_type) and "text/html" in content_type
    handler_cls = HTMLDownloadHandler if is_html else RawDownloadHandler

    return handler_cls(body_retriever, client, msg, command, directory).handle()


class ResponseHandler(ABC):
    """Abstract base for objects that process a single response.

    Holds the four collaborators every handler works with: the client, the
    body retriever, the response message and the command that was issued.
    """

    client: HTTPClient
    retriever: Retriever
    msg: Message
    cmd: AbstractCommand

    def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd):
        """Store the collaborators on the instance.

        :param retriever: object used to read the response body
        :param client: the client the response was received on
        :param msg: the response message
        :param cmd: the command that triggered the request
        """
        self.client, self.retriever = client, retriever
        self.msg, self.cmd = msg, cmd

    @abstractmethod
    def handle(self):
        """Process the response; concrete subclasses must override this."""


class BasicResponseHandler(ResponseHandler):
    """
    Response handler that inspects the status code and decides what happens next.

    Successful (2xx) responses hand the body retriever back to the caller.
    Redirects and error responses have their body read and discarded; a
    redirect is then followed by re-executing the command on the new location.
    """

    def __init__(self, client: HTTPClient, msg: Message, cmd: AbstractCommand):
        """Create the handler, building the body retriever from the response headers."""
        super().__init__(Retriever.create(client, msg.headers), client, msg, cmd)

    def handle(self):
        """Evaluate the response status.

        :return: the retriever for a 2xx response, otherwise `None`
        """
        return self._handle_status()

    def _skip_body(self):
        """Read the whole body and discard it, echoing each chunk at debug level."""
        logging.debug("Skipping body: [")
        for chunk in self.retriever.retrieve():
            try:
                logging.debug("%s", chunk.decode(FORMAT))
            except Exception:
                # Chunk could not be decoded; log its repr instead.
                logging.debug("%r", chunk)

        logging.debug("] done.")

    def _handle_status(self):
        """Dispatch on the status code of the response.

        :return: the retriever when the status is 2xx; `None` for every other
            status (including after a redirect has been followed)
        """
        status = self.msg.status
        logging.info("%d %s", status, self.msg.msg)

        # Switching protocols is not supported: just dump the raw message.
        if status == 101:
            print("".join(self.msg.raw), end="")
            return None

        # Success: the caller gets the retriever to read the body.
        if 200 <= status < 300:
            return self.retriever

        # Redirect: drain the body, then follow the new location.
        if 300 <= status < 400:
            self._skip_body()
            return self._do_handle_redirect()

        # Client/server error: drain the body and dump the raw message,
        # unless this request was issued as a sub request.
        if 400 <= status < 600:
            self._skip_body()
            if not self.cmd.sub_request:
                print("".join(self.msg.raw), end="")

        return None

    def _do_handle_redirect(self):
        """Follow a redirect by pointing the command at the new location.

        A 304 response is not followed; its raw message is printed instead.

        :return: always `None`
        :raises InvalidResponse: when the `location` header is missing or
            blank, when the resolved location has no hostname, or when its
            scheme is not ``http``
        """
        if self.msg.status == 304:
            print("".join(self.msg.raw), end="")
            return None

        raw_location = self.msg.headers.get("location")
        if not raw_location or not raw_location.strip():
            raise InvalidResponse("No location in redirect")

        # Resolve the (possibly relative) location against the current URI.
        target = parser.urljoin(self.cmd.uri, raw_location)
        target_parts = urlsplit(target)

        if not target_parts.hostname:
            raise InvalidResponse("Invalid location")
        if target_parts.scheme != "http":
            raise InvalidResponse("Only http is supported")

        self.cmd.uri = target
        self.cmd.host, self.cmd.port, self.cmd.path = parser.parse_uri(target)

        if self.msg.status == 301:
            logging.info("Status 301. Closing socket [%s]", self.cmd.host)
            self.client.close()

        self.cmd.execute()

        return None


class DownloadHandler(ResponseHandler, ABC):
    """Base class for handlers that save the response body to a file.

    On construction the target path is computed and stored in `self.path`:
    a file name derived from the command's path, placed inside `directory`
    (or a freshly created directory named after the client's host), and made
    unique so that an existing file is not overwritten.
    """

    def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, directory=None):
        """Initialise the handler and determine the download path.

        :param retriever: object used to read the response body
        :param client: the client the response was received on
        :param msg: the response message
        :param cmd: the command that triggered the request
        :param directory: directory to download into; when falsy, a new
            directory named after `client.host` is created in the current
            working directory
        """
        super().__init__(retriever, client, msg, cmd)

        if not directory:
            directory = self._create_directory()

        self.path = self._get_duplicate_name(os.path.join(directory, self.get_filename()))

    @staticmethod
    def create(retriever: Retriever, client: HTTPClient, msg, cmd, directory=None):
        """Build the download handler matching the response's content type.

        :return: an `HTMLDownloadHandler` when the `content-type` header
            contains ``text/html``, otherwise a `RawDownloadHandler`
        """
        content_type = msg.headers.get("content-type")
        if content_type and "text/html" in content_type:
            return HTMLDownloadHandler(retriever, client, msg, cmd, directory)
        return RawDownloadHandler(retriever, client, msg, cmd, directory)

    def _create_directory(self):
        """Create a new, not yet existing directory named after the client's host.

        :return: the absolute path of the created directory
        """
        path = self._get_duplicate_name(os.path.abspath(self.client.host))
        os.mkdir(path)
        return path

    def _get_duplicate_name(self, path):
        """Return `path`, or `path.<n>` with the lowest n >= 1 that does not exist yet."""
        tmp_path = path
        i = 0
        while os.path.exists(tmp_path):
            i += 1
            tmp_path = "{path}.{counter}".format(path=path, counter=i)

        return tmp_path

    def get_filename(self):
        """Returns the filename to download the payload to.

        The name is the last segment of the command's path, percent-decoded
        and stripped of every character that is not a word character, ``.``,
        ``+`` or ``-``. ``index.html`` is returned when the path has no last
        segment or when nothing alphanumeric remains after sanitising.
        """
        filename = os.path.basename(self.cmd.path)
        if filename == '':
            return "index.html"

        # Decode nested percent-encodings (e.g. "%2541" -> "%41" -> "A").
        # Stop as soon as decoding no longer changes anything: a literal "%"
        # that is not part of a valid escape (e.g. "100%.png") stays in the
        # string and would otherwise keep this loop spinning forever.
        while "%" in filename:
            decoded = unquote(filename)
            if decoded == filename:
                break
            filename = decoded

        # Drop unsafe characters (including any leftover "%" and path
        # separators introduced by decoding) together with dots directly
        # following them.
        filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
        result = os.path.basename(filename).strip()
        if any(letter.isalnum() for letter in result):
            return result

        return "index.html"


class RawDownloadHandler(DownloadHandler):
    """Download handler that writes the response body to disk unmodified."""

    def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
        """Initialise the handler; `dir` is the directory to download into."""
        super().__init__(retriever, client, msg, cmd, dir)

    def handle(self) -> str:
        """Write every chunk produced by the retriever to `self.path`.

        :return: the path of the written file
        """
        logging.debug("Retrieving payload")

        # The context manager guarantees the file is closed even when
        # retrieving or writing a chunk raises.
        with open(self.path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)

        return self.path


class HTMLDownloadHandler(DownloadHandler):
    """Download handler for HTML pages.

    The page is first stored in a temporary file, then every `<img src=...>`
    it references is downloaded next to the page and the `src` values are
    rewritten to the local file names before the final page is written.
    """

    def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, directory=None):
        """Initialise the handler; `directory` is the directory to download into."""
        super().__init__(retriever, client, msg, cmd, directory)

    def handle(self) -> str:
        """Download the page and its images.

        :return: the path of the final HTML file
        """
        directory, filename = os.path.split(self.path)
        tmp_path = os.path.join(directory, f".{filename}.tmp")

        try:
            with open(tmp_path, "wb") as tmp_file:
                for buffer in self.retriever.retrieve():
                    tmp_file.write(buffer)

            charset = parser.get_charset(self.msg.headers)
            self._download_images(tmp_path, self.path, charset)
        finally:
            # Never leave the temporary file behind, also not when the
            # download or the image processing failed half-way.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        return self.path

    def _download_images(self, tmp_filename, target_filename, charset=FORMAT):
        """Download the images of the page and write the rewritten HTML.

        :param tmp_filename: path of the file holding the raw downloaded page
        :param target_filename: path the processed page is written to
        :param charset: encoding used to read the raw page; when the page
            cannot be decoded with it (or the encoding name is unknown) the
            page is read as `FORMAT` with undecodable bytes replaced
        """
        try:
            with open(tmp_filename, "r", encoding=charset) as fp:
                html = fp.read()
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: body is not valid in the given charset.
            # LookupError: the charset name is not known to Python.
            with open(tmp_filename, "r", encoding=FORMAT, errors="replace") as fp:
                html = fp.read()

        # A <base href> element changes what relative image URLs resolve against.
        base_element = BASE_REGEX.search(html)
        base_url = self.cmd.uri
        if base_element:
            base_url = parser.urljoin(self.cmd.uri, base_element.group(1))

        processed = {}  # image src as written in the page -> downloaded file
        to_replace = []  # (start, end, local name) of each src value to rewrite

        for m in IMG_REGEX.finditer(html):
            target = m.group(1)
            if not target:
                continue

            try:
                new_url = processed.get(target)
                if not new_url:
                    new_url = self.__download_image(target, base_url)
                    if not new_url:
                        # Image failed to download
                        continue
                    processed[target] = new_url

                to_replace.append((m.start(1), m.end(1), os.path.basename(new_url)))
            except Exception as e:
                # Best effort: a broken image must not abort the page download.
                logging.error("Failed to download image: %s, skipping...", target, exc_info=e)

        # Matches are in document order and do not overlap, so the page can be
        # rebuilt in a single pass instead of re-slicing it per replacement.
        pieces = []
        cursor = 0
        for (start, end, local_name) in to_replace:
            pieces.append(html[cursor:start])
            pieces.append(local_name)
            cursor = end
        pieces.append(html[cursor:])
        html = "".join(pieces)

        with open(target_filename, 'w', encoding=FORMAT) as file:
            file.write(html)

    def __download_image(self, img_src, base_url):
        """
        Download image from the specified `img_src` and `base_url`.
        If the image is available, it will be downloaded to the directory of `self.path`

        :param img_src: the image URL as written in the page
        :param base_url: URL that `img_src` is resolved against
        :return: the `filename` attribute of the executed `GetCommand`
        :raises ValueError: when `img_src` contains an invalid port
        """

        logging.info("Downloading image: %s", img_src)

        parsed = urlsplit(img_src)
        img_src = parser.urljoin(base_url, img_src)

        if parsed.hostname is None or parsed.hostname == self.cmd.host:
            # Relative URL or same host: reuse the port of the page request.
            port = self.cmd.port
        else:
            # `SplitResult.port` yields an int (or None when absent) and copes
            # with userinfo and IPv6 literals, unlike splitting netloc on ":".
            port = parsed.port or 80

        command = GetCommand(img_src, port, os.path.dirname(self.path))
        command.execute(True)

        return command.filename