import logging
import os
import re
from typing import Dict
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from client.Retriever import Retriever
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine


def handle(client: HTTPClient, url: str):
    """Read one HTTP response from `client` and download its payload.

    Parses the status-line and the headers, builds the matching download
    handler through `construct` and runs it. Parsing and encoding errors are
    logged (at debug level) and swallowed, so this function returns `None` in
    every case.

    :param client: client the response is read from
    :param url: URL that was requested, used to derive the target filename
    """
    logging.debug("Waiting for response")
    try:
        (version, status, _) = get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
        headers = get_headers(client)
        logging.debug("Parsed headers: %r", headers)

        response_handler = construct(client, headers, status, url)
        response_handler.handle()
    # The order of the clauses is kept as-is: the exception hierarchy is
    # declared in `client.httpclient` and is not visible from this module.
    except InvalidResponse as e:
        logging.debug("Internal error: Response could not be parsed", exc_info=e)
    except InvalidStatusLine as e:
        logging.debug("Internal error: Invalid status-line in response", exc_info=e)
    except UnsupportedEncoding as e:
        logging.debug("Internal error: Unsupported encoding in response", exc_info=e)


def get_status_line(client: HTTPClient):
    """Read and validate the status-line of a response.

    :param client: client the line is read from
    :return: tuple `(version, status, reason)`; `version` is `"1.0"` or
        `"1.1"`, `status` is an int in the range 100-999 and `reason` is the
        reason phrase without the line terminator
    :raises InvalidStatusLine: if the line has fewer than three
        space-separated parts, or the version or status code is malformed
    """
    line = client.read_line()
    parts = list(filter(None, line.split(" ")))
    if len(parts) < 3:
        raise InvalidStatusLine(line)

    http_version, status, *reason_words = parts

    # Check HTTP version
    if len(http_version) < 8 or http_version[4] != "/":
        raise InvalidStatusLine(line)
    (name, version) = http_version[:4], http_version[5:]
    # fullmatch: the whole token has to be `1.0` or `1.1`, so neither `1.|`
    # nor `1.15` slips through.
    if name != "HTTP" or not re.fullmatch(r"1\.[01]", version):
        raise InvalidStatusLine(line)

    # fullmatch keeps trailing garbage such as `200abc` away from int().
    if not re.fullmatch(r"[0-9]{3}", status):
        raise InvalidStatusLine(line)
    status = int(status)
    if status < 100 or status > 999:
        raise InvalidStatusLine(line)

    # The reason phrase may consist of several words ("Not Found").
    reason = " ".join(reason_words).strip()
    return version, status, reason


def get_headers(client: HTTPClient):
    """Read the header section of a response into a dictionary.

    Lines that start with whitespace directly after the status-line are
    ignored. Later lines that start with whitespace are treated as the
    continuation of the previous header. Lines without a colon, with an empty
    name or with an empty value are skipped.

    :param client: client the lines are read from
    :return: dict mapping the lower-cased header name to the lower-cased value
    :raises InvalidResponse: if `read_line` returns an empty string before the
        header section is terminated, or if `check_next_header` rejects a
        header
    """
    terminators = ("\r\n", "\n", " ")

    # first header after the status-line may not contain a space
    while True:
        line = client.read_line()
        if not line:
            raise InvalidResponse("Unexpected end of response while reading headers")
        # Stop on the blank line as well: it starts with whitespace too, but
        # it terminates an (empty) header section and must not be skipped.
        if line in terminators or not line[0].isspace():
            break

    raw_lines = []
    while line not in terminators:
        if not line:
            raise InvalidResponse("Unexpected end of response while reading headers")

        if line[0].isspace():
            # Continuation line: glue it to the previous header line.
            raw_lines[-1] = raw_lines[-1].rstrip("\r\n")

        raw_lines.append(line.lstrip())
        line = client.read_line()

    result = {}
    for header_line in "".join(raw_lines).splitlines():
        pos = header_line.find(":")

        # No colon, empty name or empty value
        if pos <= 0 or pos >= len(header_line) - 1:
            continue

        (header, value) = map(str.strip, header_line.split(":", 1))
        check_next_header(result, header, value)
        result[header.lower()] = value.lower()

    return result


def check_next_header(headers, next_header: str, next_value: str):
    """Validate a header before it is added to the already parsed `headers`.

    Only `content-length` is checked: it may occur once and its value has to
    be a positive decimal number.

    :param headers: headers parsed so far, keyed by lower-cased name
    :param next_header: name of the header about to be added, in any case
    :param next_value: value of the header about to be added
    :raises InvalidResponse: on a duplicate or invalid content-length
    """
    # Header names are case-insensitive and arrive here exactly as sent.
    if next_header.lower() != "content-length":
        return

    if "content-length" in headers:
        logging.error("Multiple content-length headers specified")
        raise InvalidResponse()

    # isdecimal() only accepts characters that int() can convert.
    if not next_value.isdecimal() or int(next_value) <= 0:
        logging.error("Invalid content-length value: %r", next_value)
        raise InvalidResponse()


def construct(client: HTTPClient, headers, status_code, url):
    """Create the download handler for a response.

    :param client: client the payload is read from
    :param headers: parsed headers, keyed by lower-cased name
    :param status_code: status code of the response (currently unused)
    :param url: URL that was requested
    :return: an `HTMLDownloadHandler` when the content-type contains
        `text/html`, a `RawDownloadHandler` otherwise
    :raises UnsupportedEncoding: for a transfer-encoding other than `chunked`
        or for any content-encoding
    """
    # only chunked transfer-encoding is supported
    transfer_encoding = headers.get("transfer-encoding")
    if transfer_encoding and transfer_encoding != "chunked":
        raise UnsupportedEncoding("transfer-encoding", transfer_encoding)

    # content-encoding is not supported
    content_encoding = headers.get("content-encoding")
    if content_encoding:
        raise UnsupportedEncoding("content-encoding", content_encoding)

    retriever = Retriever.create(client, headers)
    return DownloadHandler.create(retriever, client, headers, url)


def parse_uri(uri: str):
    """Split `uri` into its host and path.

    :param uri: URI with or without a scheme
    :return: tuple `(host, path)`; `path` always starts with `/`
    """
    parsed = urlparse(uri)

    # If there is no netloc, the url is invalid, so prepend `//` and try again
    if parsed.netloc == "":
        parsed = urlparse("//" + uri)

    host = parsed.netloc
    path = parsed.path
    if not path.startswith("/"):
        path = "/" + path

    return host, path


class ResponseHandler:
    """Base class of the handlers that process a parsed response."""
    client: HTTPClient
    headers: Dict[str, str]
    status_code: int  # declared only; nothing in this module assigns it
    url: str
    retriever: Retriever

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
        self.client = client
        self.headers = headers
        self.url = url
        self.retriever = retriever

    def handle(self):
        """Process the response. The base implementation does nothing."""
        pass


class DownloadHandler(ResponseHandler):
    """Handler that stores the payload of a response in a file.

    The target is never an existing path: a numeric suffix is appended until
    the name is free.
    """
    path: str

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        """
        :param dir: directory to download to; when omitted or empty, a new
            directory named after `client.host` is created in the current
            working directory
        """
        super().__init__(retriever, client, headers, url)

        if not dir:
            dir = self._create_directory()

        self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))

    @staticmethod
    def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        """Create the handler that matches the content-type in `headers`.

        :return: an `HTMLDownloadHandler` when the content-type contains
            `text/html`, a `RawDownloadHandler` otherwise
        """
        content_type = headers.get("content-type")
        if content_type and "text/html" in content_type:
            return HTMLDownloadHandler(retriever, client, headers, url, dir)
        return RawDownloadHandler(retriever, client, headers, url, dir)

    def handle(self) -> str:
        """Download the payload. Subclasses return the path written to."""
        pass

    def _create_directory(self):
        """Create a new directory named after the host and return its path."""
        path = self._get_duplicate_name(os.path.abspath(self.client.host))
        os.mkdir(path)
        return path

    def _get_duplicate_name(self, path):
        """Return `path`, or `path.<n>` with the lowest `n >= 1` that does
        not exist yet.
        """
        candidate = path
        counter = 0
        while os.path.exists(candidate):
            counter += 1
            candidate = "{path}.{counter}".format(path=path, counter=counter)

        return candidate

    def get_filename(self):
        """Returns the filename to download the payload to.

        This is the last segment of the path of `self.url`, or `index.html`
        when the path is empty, ends with a `/` or its last segment contains
        no alphanumeric character.
        """
        filename = "index.html"

        parsed = urlparse(self.url)

        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlparse("//" + self.url)

        # If the path contains a `/` get only the last part and use it as filename
        # If the path end with a `/`, it's a directory so ignore it.
        if len(parsed.path) != 0:
            index = parsed.path.rfind("/")
            if index == -1:
                filename = parsed.path
            elif parsed.path[-1] != "/":
                filename = parsed.path[index:]

        result = os.path.basename(filename).strip()
        if any(letter.isalnum() for letter in result):
            return result

        return "index.html"

    def _handle_sub_request(self, client, url):
        """Read the response of a sub-request and store its payload next to
        `self.path`.

        :param client: client the request was sent on
        :param url: URL of the requested resource
        :return: path of the downloaded file
        :raises InvalidResponse: if the status code is not 200
        """
        (version, status, _) = get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
        headers = get_headers(client)
        logging.debug("Parsed headers: %r", headers)

        if status != 200:
            raise InvalidResponse("Status not expected 200: " + str(status))

        retriever = Retriever.create(client, headers)
        handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))

        return handler.handle()


class RawDownloadHandler(DownloadHandler):
    """Handler that writes the payload to disk unmodified."""

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url, dir)

    def handle(self) -> str:
        """Write every buffer of the retriever to `self.path`.

        :return: path of the written file
        """
        logging.debug("Retrieving payload")
        # `with` closes the file even when retrieving fails halfway.
        with open(self.path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)

        return self.path


class HTMLDownloadHandler(DownloadHandler):
    """Handler that stores an HTML page together with the images it uses."""

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url, dir)

    def handle(self) -> str:
        """Download the page and its images.

        The payload is first written to a hidden temporary file next to the
        target. It is then parsed, the images are downloaded and the page,
        with its image sources rewritten, is written to `self.path`.

        :return: path of the written page
        """
        (directory, file_name) = os.path.split(self.path)
        tmp_filename = ".{file}.tmp".format(file=file_name)
        tmp_path = os.path.join(directory, tmp_filename)

        try:
            with open(tmp_path, "wb") as tmp_file:
                for buffer in self.retriever.retrieve():
                    tmp_file.write(buffer)

            self.__download_images(tmp_path, self.path)
        finally:
            # Never leave the temporary file behind, not even on failure.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        return self.path

    def __download_images(self, tmp_filename, target_filename):
        """Download the images of the page in `tmp_filename` and write the
        page with rewritten `src` attributes to `target_filename`.

        An image that cannot be downloaded is logged and skipped; its tag is
        left untouched.
        """
        (host, path) = parse_uri(self.url)
        with open(tmp_filename, "rb") as fp:
            soup = BeautifulSoup(fp, 'html.parser')

        for tag in soup.find_all("img"):
            try:
                tag["src"] = self.__download_image(tag["src"], host, path)
            except Exception as e:
                # `tag.get`: an <img> without `src` is what may have raised,
                # so indexing again here would abort the whole page.
                logging.error("Failed to download image: %s, skipping...", tag.get("src"), exc_info=e)

        with open(target_filename, 'w') as file:
            file.write(str(soup))

    def __download_image(self, img_src, host, path):
        """Download one image and return the path it was stored at.

        Images without a host, or on the host of the page, are requested over
        `self.client`. For other hosts a new client is connected on port 80
        and closed afterwards.

        :param img_src: value of the `src` attribute of the image
        :param host: host of the page
        :param path: path of the page, used to resolve relative sources
        :return: path of the downloaded file
        :raises ValueError: if `img_src` has no host and no path
        """
        parsed = urlparse(img_src)
        logging.debug("Downloading image: %s", img_src)

        same_host = len(parsed.netloc) == 0 or parsed.netloc == host

        if same_host:
            if not parsed.path:
                raise ValueError("Image source has no path: {src!r}".format(src=img_src))

            img_host = host
            if parsed.path[0] != "/":
                # Relative source: resolve it against the directory of the page
                base = os.path.split(path)[0]
                if base[-1] != '/':
                    base += "/"
                img_path = base + parsed.path
            else:
                img_path = parsed.path
        else:
            (img_host, img_path) = parse_uri(img_src)

        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
        # The Host header names the server the request is sent to, which is
        # not the host of the page when the image lives elsewhere.
        message += "Host: {host}\r\n\r\n".format(host=img_host)
        message = message.encode(FORMAT)

        if same_host:
            client = self.client
            client.reset_request()
        else:
            client = HTTPClient(img_src)

        try:
            if not same_host:
                client.connect((img_host, 80))
            client.sendall(message)
            filename = self._handle_sub_request(client, img_host + img_path)
        finally:
            # Only the client created here is ours to close.
            if not same_host:
                client.close()

        return filename