import logging
import os
import re
from abc import ABC, abstractmethod
from urllib.parse import urlsplit, unquote

from bs4 import BeautifulSoup, Tag

from client.command import AbstractCommand, GetCommand
from client.httpclient import HTTPClient, FORMAT
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.message import Message
from httplib.retriever import Retriever


def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
    """Process a response: handle status/redirects, then download the body.

    Returns the path of the downloaded file, or None when there is nothing
    to download (redirect already re-executed, error status, ...).
    """
    handler = BasicResponseHandler(client, msg, command)
    retriever = handler.handle()
    if retriever is None:
        return
    # Delegate to the factory so the content-type dispatch lives in one place.
    return DownloadHandler.create(retriever, client, msg, command, dir).handle()


class ResponseHandler(ABC):
    """Base class for processing one HTTP response message."""

    client: HTTPClient
    retriever: Retriever
    msg: Message
    cmd: AbstractCommand

    def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd):
        self.client = client
        self.retriever = retriever
        self.msg = msg
        self.cmd = cmd

    @abstractmethod
    def handle(self):
        """Process the response; the return value's meaning is subclass-specific."""

    @staticmethod
    def parse_uri(uri: str):
        """Split *uri* into ``(host, absolute_path)``."""
        parsed = urlsplit(uri)
        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlsplit("//" + uri)
        host = parsed.netloc
        path = parsed.path
        if len(path) == 0 or path[0] != '/':
            path = "/" + path
        return host, path


class BasicResponseHandler(ResponseHandler):
    """Response handler which throws away the body and only shows the headers.

    In case of a redirect, it will process it and pass it to the appropriate
    response handler.
    """

    def __init__(self, client: HTTPClient, msg: Message, cmd: AbstractCommand):
        retriever = Retriever.create(client, msg.headers)
        super().__init__(retriever, client, msg, cmd)

    def handle(self):
        return self._handle_status()

    def _skip_body(self):
        """Drain the body from the socket, logging it at debug level."""
        logging.debug("Skipping body: [")
        for line in self.retriever.retrieve():
            try:
                logging.debug("%s", line.decode(FORMAT))
            except UnicodeDecodeError:
                # Binary payload: log the raw repr instead of decoded text.
                logging.debug("%r", line)
        logging.debug("] done.")

    def _handle_status(self):
        """Dispatch on the status code.

        Returns the retriever for 2xx responses, None otherwise.
        """
        logging.info("%d %s", self.msg.status, self.msg.msg)
        if self.msg.status == 101:
            # Switching protocols is not supported
            print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
            print(self.msg.headers)
            return None
        if 200 <= self.msg.status < 300:
            return self.retriever
        if 300 <= self.msg.status < 400:
            # Redirect
            return self._do_handle_redirect()
        if self.msg.status >= 400:
            # Dump headers and give up. Fix: 5xx responses previously fell
            # through without any output; treat all >= 400 as errors.
            print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
            print(self.msg.headers)
        return None

    def _do_handle_redirect(self):
        """Validate the Location header and re-execute the command against it.

        Raises InvalidResponse when the redirect target is missing or unusable.
        """
        self._skip_body()
        location = self.msg.headers.get("location")
        if not location:
            raise InvalidResponse("No location in redirect")
        parsed_location = urlsplit(location)
        if not parsed_location.hostname:
            raise InvalidResponse("Invalid location")
        if not parsed_location.scheme == "http":
            raise InvalidResponse("Only http is supported")
        self.cmd.uri = location
        self.cmd.host, self.cmd.port, self.cmd.path = parser.parse_uri(location)
        if self.msg.status == 301:
            # Permanent redirect may point to another host; drop the connection.
            logging.info("Status 301. Closing socket [%s]", self.cmd.host)
            self.client.close()
        self.cmd.execute()
        return None


class DownloadHandler(ResponseHandler, ABC):
    """Base class for handlers that write the response payload to disk."""

    def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
        super().__init__(retriever, client, msg, cmd)
        if not dir:
            dir = self._create_directory()
        self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))

    @staticmethod
    def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
        """Factory: pick a handler based on the response content type."""
        content_type = msg.headers.get("content-type")
        if content_type and "text/html" in content_type:
            return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
        return RawDownloadHandler(retriever, client, msg, cmd, dir)

    def _create_directory(self):
        # Download into a fresh directory named after the host.
        path = self._get_duplicate_name(os.path.abspath(self.client.host))
        os.mkdir(path)
        return path

    def _get_duplicate_name(self, path):
        """Return *path* itself, or ``path.N`` for the first N not on disk."""
        tmp_path = path
        i = 0
        while os.path.exists(tmp_path):
            i += 1
            tmp_path = "{path}.{counter}".format(path=path, counter=i)
        return tmp_path

    def get_filename(self):
        """Returns the filename to download the payload to."""
        filename = os.path.basename(self.cmd.path)
        if filename == '':
            return "index.html"
        # Fully percent-decode possibly double-encoded names. Fix: the old
        # `while "%" in filename` loop never terminated when a literal `%`
        # remained after decoding (e.g. "100%"); stop at a fixed point instead.
        decoded = unquote(filename)
        while decoded != filename:
            filename = decoded
            decoded = unquote(filename)
        # Strip characters that are unsafe in file names.
        filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
        result = os.path.basename(filename).strip()
        if any(letter.isalnum() for letter in result):
            return result
        return "index.html"


class RawDownloadHandler(DownloadHandler):
    """Downloads the payload verbatim to `self.path`."""

    def handle(self) -> str:
        logging.debug("Retrieving payload")
        # Fix: `with` guarantees the file is closed even if retrieval fails.
        with open(self.path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)
        return self.path


class HTMLDownloadHandler(DownloadHandler):
    """Downloads an HTML page plus the images it references.

    The raw page is first written to a hidden temp file, the images are
    fetched and their `src` attributes rewritten to local names, and the
    resulting HTML is saved to the final location.
    """

    def handle(self) -> str:
        (dir, file) = os.path.split(self.path)
        tmp_filename = f".{file}.tmp"
        tmp_path = os.path.join(dir, tmp_filename)
        # Fix: context manager instead of a bare open/close pair.
        with open(tmp_path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)
        try:
            self._download_images(tmp_path, self.path)
        finally:
            # Fix: remove the temp file even when image processing fails,
            # so failed runs don't litter the download directory.
            os.remove(tmp_path)
        return self.path

    def _download_images(self, tmp_filename, target_filename):
        """Parse the page in *tmp_filename*, download its images, and write
        the rewritten HTML to *target_filename*."""
        (host, path) = ResponseHandler.parse_uri(self.cmd.uri)
        with open(tmp_filename, "rb") as fp:
            soup = BeautifulSoup(fp, 'lxml')
        # A <base> element, when present, overrides the page URL for
        # resolving relative image references.
        base_url = self.cmd.uri
        base_element = soup.find("base")
        if base_element:
            base_url = base_element["href"]
        # src -> local filename (or None); avoids refetching duplicates.
        processed = {}
        tag: Tag
        for tag in soup.find_all("img"):
            try:
                if not tag.has_attr("src"):
                    continue
                if tag["src"] in processed:
                    new_url = processed.get(tag["src"])
                else:
                    new_url = self.__download_image(tag["src"], host, base_url)
                    processed[tag["src"]] = new_url
                if new_url:
                    tag["src"] = os.path.basename(new_url)
            except Exception as e:
                # Best effort: one broken image must not abort the page.
                logging.error("Failed to download image: %s, skipping...",
                              tag["src"], exc_info=e)
        with open(target_filename, 'w') as file:
            file.write(str(soup))

    def __download_image(self, img_src, host, base_url):
        """Fetch a single image; returns the local filename, or None when
        the reference cannot be downloaded (e.g. a `data:` URI)."""
        logging.debug("Downloading image: %s", img_src)
        if not img_src:
            # Fix: empty src would crash on img_src[0] below.
            return None
        parsed = urlsplit(img_src)
        if parsed.scheme not in ("", "http", "https"):
            # Not a valid url
            return None
        if parsed.hostname is None:
            if img_src[0] == "/":
                img_src = host + img_src
            else:
                img_src = os.path.join(os.path.dirname(base_url), img_src)
        if parsed.hostname is None or parsed.hostname == host:
            port = self.cmd.port
        elif ":" in parsed.netloc:
            # Fix: the port must be an int like every other code path
            # (cmd.port, 80); the old code passed the raw string.
            port = int(parsed.netloc.split(":", 1)[1])
        else:
            port = 80
        command = GetCommand(img_src, port, os.path.dirname(self.path))
        command.execute(True)
        return command.filename