import logging
import os
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse

import cssutils
from bs4 import BeautifulSoup, Tag

from client.httpclient import HTTPClient, FORMAT
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.retriever import Retriever


class ResponseHandler(ABC):
    """Dispatches a parsed HTTP response to the handler matching its content type."""

    client: HTTPClient
    headers: Dict[str, str]
    status_code: int
    url: str
    retriever: Retriever

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
        self.client = client
        self.headers = headers
        self.url = url
        self.retriever = retriever

    @abstractmethod
    def handle(self):
        pass

    @staticmethod
    def create(client: HTTPClient, headers, status_code, url):
        """Factory: picks an HTML-aware handler for `text/html`, a raw one otherwise."""
        retriever = Retriever.create(client, headers)
        content_type = headers.get("content-type")
        if content_type and "text/html" in content_type:
            return HTMLDownloadHandler(retriever, client, headers, url)
        return RawDownloadHandler(retriever, client, headers, url)

    @staticmethod
    def parse_uri(uri: str):
        """Splits a URI into (host, path), normalising the path to start with `/`."""
        parsed = urlparse(uri)
        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlparse("//" + uri)
        host = parsed.netloc
        path = parsed.path
        if len(path) == 0 or path[0] != '/':
            path = "/" + path
        return host, path


class DownloadHandler(ResponseHandler, ABC):
    path: str

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url)
        if not dir:
            dir = self._create_directory()
        self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))

    @staticmethod
    def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        content_type = headers.get("content-type")
        if content_type and "text/html" in content_type:
            return HTMLDownloadHandler(retriever, client, headers, url, dir)
        return RawDownloadHandler(retriever, client, headers, url, dir)

    def _create_directory(self):
        path = self._get_duplicate_name(os.path.abspath(self.client.host))
        os.mkdir(path)
        return path

    def _get_duplicate_name(self, path):
        """Appends `.1`, `.2`, ... to `path` until it no longer collides with an existing file."""
        tmp_path = path
        i = 0
        while os.path.exists(tmp_path):
            i += 1
            tmp_path = "{path}.{counter}".format(path=path, counter=i)
        return tmp_path

    def get_filename(self):
        """Returns the filename to download the payload to."""
        filename = "index.html"
        parsed = urlparse(self.url)
        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlparse("//" + self.url)
        # If the path contains a `/`, use only the last part as the filename.
        # If the path ends with a `/`, it's a directory, so ignore it.
        if len(parsed.path) != 0:
            index = parsed.path.rfind("/")
            if index == -1:
                filename = parsed.path
            elif parsed.path[-1] != "/":
                filename = parsed.path[index:]
        result = os.path.basename(filename).strip()
        # Fall back to `index.html` if the derived name has no usable characters.
        if any(letter.isalnum() for letter in result):
            return result
        return "index.html"

    def _handle_sub_request(self, client, url):
        """Reads the response to a follow-up request and saves its payload as a raw file."""
        (version, status, _) = parser.get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
        headers = parser.get_headers(client)
        logging.debug("Parsed headers: %r", headers)
        if status != 200:
            raise InvalidResponse("Expected status 200, got: " + str(status))
        retriever = Retriever.create(client, headers)
        handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))
        return handler.handle()


class RawDownloadHandler(DownloadHandler):
    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url, dir)

    def handle(self) -> str:
        logging.debug("Retrieving payload")
        with open(self.path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)
        return self.path


class HTMLDownloadHandler(DownloadHandler):
    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url, dir)

    def handle(self) -> str:
        # Download the raw HTML to a hidden temp file first, then rewrite the
        # image references into the final file and drop the temp copy.
        (dir, filename) = os.path.split(self.path)
        tmp_filename = ".{file}.tmp".format(file=filename)
        tmp_path = os.path.join(dir, tmp_filename)
        with open(tmp_path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)
        self._download_images(tmp_path, self.path)
        os.remove(tmp_path)
        return self.path

    def _download_images(self, tmp_filename, target_filename):
        """Downloads every referenced image and rewrites the HTML to point at the local copies."""
        (host, path) = ResponseHandler.parse_uri(self.url)
        with open(tmp_filename, "rb") as fp:
            soup = BeautifulSoup(fp, 'lxml')
        # Honour a <base> element when resolving relative URLs.
        base_url = self.url
        base_element = soup.find("base")
        if base_element:
            base_url = base_element["href"]
        # Cache already-downloaded URLs so each image is fetched only once.
        processed = {}
        tag: Tag
        for tag in soup.find_all("img"):
            try:
                if tag["src"] in processed:
                    new_url = processed.get(tag["src"])
                else:
                    new_url = self.__download_image(tag["src"], host, base_url)
                    processed[tag["src"]] = new_url
                if new_url:
                    tag["src"] = new_url
            except Exception as e:
                logging.debug(e)
                logging.error("Failed to download image: %s, skipping...", tag["src"])
        # Also rewrite CSS background images declared in inline styles.
        for tag in soup.find_all("div"):
            if not tag.has_attr("style"):
                continue
            style = cssutils.parseStyle(tag["style"])
            if "background" in style and "url(" in style["background"]:
                el_name = "background"
            elif "background-image" in style and "url(" in style["background-image"]:
                el_name = "background-image"
            else:
                continue
            el = style[el_name]
            start = el.find("url(") + 4
            end = el.find(")", start)
            url = el[start:end].strip()
            try:
                if url in processed:
                    # Reuse the cached local path instead of the original URL.
                    new_url = processed.get(url)
                else:
                    new_url = self.__download_image(url, host, base_url)
                    processed[url] = new_url
                if new_url:
                    el = el[:start] + new_url + el[end:]
                    style[el_name] = el
                    tag["style"] = style.cssText
            except Exception as e:
                logging.debug("Internal error", exc_info=e)
                logging.error("Failed to download image: %s, skipping...", url)
        with open(target_filename, 'w') as file:
            file.write(str(soup))

    def __download_image(self, img_src, host, base_url):
        parsed = urlparse(img_src)
        logging.debug("Downloading image: %s", img_src)
        if parsed.scheme not in ("", "http"):
            # Unsupported scheme (e.g. https or data URIs), skip it.
            return None
        if len(parsed.netloc) == 0 and not parsed.path.startswith("/"):
            # Relative url: resolve it against the base url.
            img_src = os.path.join(os.path.dirname(base_url), parsed.path)
            parsed = urlparse(img_src)
        # Check if the image is located on the same server
        if len(parsed.netloc) == 0 or parsed.netloc == host:
            same_host = True
            img_host = host
            img_path = parsed.path
        else:
            same_host = False
            (img_host, img_path) = ResponseHandler.parse_uri(img_src)
        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
        # The Host header must name the server the image lives on, not the page's host.
        message += "Host: {host}\r\n\r\n".format(host=img_host)
        message = message.encode(FORMAT)
        if same_host:
            # Reuse the already-open connection for same-host images.
            client = self.client
            client.reset_request()
        else:
            client = HTTPClient(img_src)
            client.conn.connect((img_host, 80))
        client.conn.sendall(message)
        filename = self._handle_sub_request(client, img_host + img_path)
        if not same_host:
            client.close()
        return filename
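
# Minimal usage sketch, hedged: it assumes, as the code above implies, that
# HTTPClient(url) exposes a raw socket at `.conn` and that `parser` can read a
# status line and headers off that client. Every name used below already
# appears in this module; nothing here is part of a public API.
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.DEBUG)
    target = sys.argv[1] if len(sys.argv) > 1 else "example.com/index.html"
    host, path = ResponseHandler.parse_uri(target)
    client = HTTPClient(target)
    client.conn.connect((host, 80))
    # Mirror the request format built in __download_image above.
    request = ("GET {path} HTTP/1.1\r\n"
               "Accept: */*\r\nAccept-Encoding: identity\r\n"
               "Host: {host}\r\n\r\n").format(path=path, host=host)
    client.conn.sendall(request.encode(FORMAT))
    (version, status, _) = parser.get_status_line(client)
    headers = parser.get_headers(client)
    # The factory picks HTMLDownloadHandler or RawDownloadHandler by content type.
    handler = ResponseHandler.create(client, headers, status, target)
    print("Saved to: " + handler.handle())
    client.close()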