# CN2021/client/response_handler.py

import logging
import os
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urljoin, urlparse

import cssutils
from bs4 import BeautifulSoup, Tag

from client.httpclient import HTTPClient, FORMAT
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.retriever import Retriever


class ResponseHandler(ABC):
    """Base class for objects that process the payload of an HTTP response.

    Instances are normally obtained through :meth:`create`, which selects a
    concrete handler based on the response's ``content-type`` header.
    """
    # Client the response belongs to.
    client: HTTPClient
    # Response headers; looked up with lower-case keys (e.g. "content-type").
    headers: Dict[str, str]
    # Status code of the response; assigned by `create`, not by `__init__`.
    status_code: int
    # URL this response was requested for.
    url: str
    # Retriever used to read the payload.
    retriever: Retriever

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
        self.client = client
        self.headers = headers
        self.url = url
        self.retriever = retriever

    @abstractmethod
    def handle(self):
        """Process the response payload; implemented by concrete handlers."""

    @staticmethod
    def create(client: HTTPClient, headers, status_code, url):
        """Build the handler matching the response's content type.

        :param client: client the response is read from
        :param headers: response headers
        :param status_code: status code of the response; stored on the
            returned handler as ``status_code``
        :param url: URL that was requested
        :return: an ``HTMLDownloadHandler`` for ``text/html`` responses,
            otherwise a ``RawDownloadHandler``
        """
        retriever = Retriever.create(client, headers)

        # Delegate the content-type dispatch so that it lives in one place.
        handler = DownloadHandler.create(retriever, client, headers, url)
        # The constructors do not take the status code, so record it here
        # instead of silently dropping it.
        handler.status_code = status_code
        return handler

    @staticmethod
    def parse_uri(uri: str):
        """Split *uri* into a ``(host, path)`` tuple.

        URIs without a scheme (``example.com/page``) are accepted as well.
        The returned path always starts with ``/``; query string and fragment
        are not part of the result.
        """
        parsed = urlparse(uri)

        # Without a scheme urlparse() puts the host into the path; prefixing
        # `//` makes it recognise the host as network location.
        if parsed.netloc == "":
            parsed = urlparse("//" + uri)

        host = parsed.netloc
        path = parsed.path
        if not path.startswith("/"):
            path = "/" + path
        return host, path


class DownloadHandler(ResponseHandler, ABC):
    """Response handler that stores the payload in a file on disk.

    The target file is chosen at construction time and exposed as ``path``.
    """
    # Full path of the file the payload is written to.
    path: str

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        """Pick the file the payload will be written to.

        :param dir: directory to download into; when omitted (or empty) a new
            directory named after the client's host is created.
            NOTE: the name shadows the ``dir`` builtin but is part of the
            public signature, so it is kept.
        """
        super().__init__(retriever, client, headers, url)

        if not dir:
            dir = self._create_directory()

        self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))

    @staticmethod
    def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        """Return an ``HTMLDownloadHandler`` for HTML responses, else a ``RawDownloadHandler``."""
        content_type = headers.get("content-type")
        # Media types are case-insensitive, so `Text/HTML` must match as well.
        if content_type and "text/html" in content_type.lower():
            return HTMLDownloadHandler(retriever, client, headers, url, dir)
        return RawDownloadHandler(retriever, client, headers, url, dir)

    def _create_directory(self):
        """Create a new directory named after the client's host and return its path.

        The directory is created relative to the current working directory;
        an existing name is never reused (see `_get_duplicate_name`).
        """
        path = self._get_duplicate_name(os.path.abspath(self.client.host))
        os.mkdir(path)
        return path

    def _get_duplicate_name(self, path):
        """Return *path*, or the first of ``path.1``, ``path.2``, ... that does not exist yet."""
        tmp_path = path
        i = 0
        while os.path.exists(tmp_path):
            i += 1
            tmp_path = "{path}.{counter}".format(path=path, counter=i)

        return tmp_path

    def get_filename(self):
        """Returns the filename to download the payload to.

        The name is the last segment of the URL path. ``index.html`` is used
        when the path is empty, ends with ``/`` or when the last segment
        contains no alphanumeric character.
        """
        # parse_uri() already copes with scheme-less URLs and always returns
        # a path starting with `/`.
        (_, url_path) = ResponseHandler.parse_uri(self.url)

        # Everything after the last `/`; empty when the path denotes a directory.
        last_segment = url_path.rpartition("/")[2]

        # basename() additionally drops anything before an OS-specific separator.
        result = os.path.basename(last_segment).strip()
        if any(letter.isalnum() for letter in result):
            return result

        return "index.html"

    def _handle_sub_request(self, client, url):
        """Read the response of an already sent sub-request and save its payload.

        The payload is stored next to this handler's own file.

        :param client: client the request was sent on
        :param url: URL of the requested resource, used to derive the filename
        :return: path of the downloaded file
        :raises InvalidResponse: if the response status is not 200
        """
        (version, status, _) = parser.get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
        headers = parser.get_headers(client)
        logging.debug("Parsed headers: %r", headers)

        if status != 200:
            raise InvalidResponse("Status not expected 200: " + str(status))

        retriever = Retriever.create(client, headers)
        # Always treat sub-resources as raw data and store them in the same
        # directory as the parent document.
        handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))

        return handler.handle()


class RawDownloadHandler(DownloadHandler):
    """Download handler that writes the payload to disk unmodified."""

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url, dir)

    def handle(self) -> str:
        """Write the payload to ``self.path`` and return that path."""
        logging.debug("Retrieving payload")

        # The context manager closes the file even when retrieving fails midway.
        with open(self.path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)

        return self.path


class HTMLDownloadHandler(DownloadHandler):
    """Download handler for HTML pages that also downloads referenced images.

    The page is first written to a temporary file, parsed, and then saved to
    ``self.path`` with the ``src`` of ``<img>`` tags and the inline
    ``background``/``background-image`` URLs of ``<div>`` tags rewritten to
    the locally downloaded copies. Only plain ``http`` images are fetched.
    """

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url, dir)

    def handle(self) -> str:
        """Download the page and its images; return the path of the saved page."""
        (directory, filename) = os.path.split(self.path)
        tmp_path = os.path.join(directory, ".{file}.tmp".format(file=filename))

        try:
            with open(tmp_path, "wb") as tmp_file:
                for buffer in self.retriever.retrieve():
                    tmp_file.write(buffer)

            self._download_images(tmp_path, self.path)
        finally:
            # Never leave the temporary copy behind, not even when retrieving
            # or rewriting the page fails.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        return self.path

    def _download_images(self, tmp_filename, target_filename):
        """Download the images referenced by the page and write the rewritten page.

        :param tmp_filename: file containing the page as it was received
        :param target_filename: file the rewritten page is written to
        """
        (host, page_path) = ResponseHandler.parse_uri(self.url)
        with open(tmp_filename, "rb") as fp:
            soup = BeautifulSoup(fp, 'lxml')

        # Absolute form of the page URL, so that relative references resolve
        # correctly even when `self.url` was given without a scheme.
        base_url = "http://{host}{path}".format(host=host, path=page_path)
        base_element = soup.find("base")
        # A <base> element may lack `href` (e.g. when it only sets `target`).
        base_href = base_element.get("href") if base_element is not None else None
        if base_href:
            base_url = urljoin(base_url, base_href)

        # Maps an image URL as written in the page to its local file.
        processed = {}
        tag: Tag
        for tag in soup.find_all("img"):
            src = tag.get("src")
            if not src:
                continue
            try:
                new_url = self._download_once(src, host, base_url, processed)
                if new_url:
                    tag["src"] = new_url
            except Exception as e:
                # Best effort: a failing image must not abort the whole page.
                logging.debug(e)
                logging.error("Failed to download image: %s, skipping...", src)

        for tag in soup.find_all("div"):
            if not tag.has_attr("style"):
                continue
            style = cssutils.parseStyle(tag["style"])

            if "background" in style and "url(" in style["background"]:
                el_name = "background"
            elif "background-image" in style and "url(" in style["background-image"]:
                el_name = "background-image"
            else:
                continue
            el = style[el_name]
            # Only the first `url(...)` of the property value is handled.
            start = el.find("url(") + 4
            end = el.find(")", start)
            if end == -1:
                continue
            # CSS allows the URL to be wrapped in single or double quotes.
            url = el[start:end].strip().strip("'\"")
            if not url:
                continue

            try:
                new_url = self._download_once(url, host, base_url, processed)
                if new_url:
                    style[el_name] = el[:start] + new_url + el[end:]
                    tag["style"] = style.cssText
            except Exception as e:
                logging.debug("Internal error", exc_info=e)
                logging.error("Failed to download image: %s, skipping...", url)

        # str(soup) declares utf-8 in a charset <meta> tag, so the file has to
        # be written as utf-8 rather than in the platform's default encoding.
        with open(target_filename, 'w', encoding="utf-8") as file:
            file.write(str(soup))

    def _download_once(self, url, host, base_url, processed):
        """Return the local file for *url*, downloading it only on first use.

        *processed* caches the result per URL. Failed downloads raise and are
        not cached, so a later occurrence of the same URL is tried again.
        """
        if url not in processed:
            processed[url] = self.__download_image(url, host, base_url)
        return processed[url]

    def __download_image(self, img_src, host, base_url):
        """Download a single image and return the path of the local file.

        :param img_src: image URL as written in the page
        :param host: host the page itself was downloaded from
        :param base_url: absolute URL relative references are resolved against
        :return: path of the downloaded file, or ``None`` if the URL uses a
            scheme other than ``http``
        :raises InvalidResponse: if the server does not answer with status 200
        """
        parsed = urlparse(img_src)

        logging.debug("Downloading image: %s", img_src)

        if parsed.scheme not in ("", "http"):
            # Not a valid url (e.g. `data:` or `https:`)
            return None

        if not parsed.netloc:
            # Relative reference: resolve it with URL semantics. os.path.join
            # is OS-dependent and does not handle `../` or host changes.
            img_src = urljoin(base_url, img_src)
            parsed = urlparse(img_src)
            if parsed.scheme not in ("", "http"):
                # The base URL points to a scheme this client cannot speak.
                return None

        # Check if the image is located on the same server
        if not parsed.netloc or parsed.netloc == host:
            same_host = True
            img_host = host
        else:
            same_host = False
            img_host = parsed.netloc

        # The request target has to be an absolute path.
        img_path = parsed.path
        if not img_path.startswith("/"):
            img_path = "/" + img_path
        request_target = img_path
        if parsed.query:
            request_target += "?" + parsed.query

        message = "GET {path} HTTP/1.1\r\n".format(path=request_target)
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
        # The Host header must name the server the request is sent to, which
        # differs from the page host for cross-host images.
        message += "Host: {host}\r\n\r\n".format(host=img_host)
        message = message.encode(FORMAT)

        if same_host:
            client = self.client
            client.reset_request()
            client.conn.sendall(message)
            return self._handle_sub_request(client, img_host + img_path)

        client = HTTPClient(img_src)
        try:
            # The netloc may carry an explicit port; connect to host and port
            # separately and default to port 80.
            client.conn.connect((parsed.hostname, parsed.port or 80))
            client.conn.sendall(message)
            return self._handle_sub_request(client, img_host + img_path)
        finally:
            # Close the extra connection even when the download fails.
            client.close()