"""HTTP response handling.

Receives raw HTTP responses from an HTTPClient, parses the status line and
headers, and dispatches the body to a handler chosen by transfer/content
encoding and content type (plain download, HTML with inline image fetching,
or chunked — the latter currently a stub).
"""
import logging
import os
from typing import Dict
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from client.httpclient import (
    FORMAT,
    HTTPClient,
    InvalidResponse,
    InvalidStatusLine,
    UnsupportedEncoding,
)


def handle(client: HTTPClient, url: str):
    """Receive one response on ``client`` and dispatch it to a handler.

    Prints an ``[ABRT]`` diagnostic and returns ``None`` on timeout or on
    any malformed/unsupported response; otherwise delegates the remaining
    buffered bytes to the constructed ResponseHandler.
    """
    logging.debug("Waiting for response")
    try:
        buffer = client.receive()
    except TimeoutError:
        print("[ABRT] Response timed out")
        return
    try:
        (header_chunk, buffer) = client.get_crlf_chunk(buffer)
        (status_line, headers) = client.parse_headers(header_chunk)
        client.validate_status_line(status_line)
        # Status line shape is "HTTP/1.1 <code> <reason>"; token 1 is the code.
        status_code = int(status_line.split(" ")[1])
        response_handler = construct(client, headers, status_code, url)
        response_handler.handle(buffer)
    except InvalidResponse as e:
        logging.debug("Internal error: Response could not be parsed", exc_info=e)
        print("[ABRT] Invalid response")
        return
    except InvalidStatusLine as e:
        logging.debug("Internal error: Invalid status-line in response", exc_info=e)
        print("[ABRT] Invalid response")
        return
    except UnsupportedEncoding as e:
        logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
        print("[ABRT] Invalid response")
        return


def construct(client: HTTPClient, headers, status_code, url):
    """Select the ResponseHandler subclass appropriate for ``headers``.

    Raises UnsupportedEncoding for any transfer-encoding other than
    "chunked" and for any content-encoding at all (none are supported).
    """
    # Only chunked transfer-encoding is supported.
    transfer_encoding = headers.get("transfer-encoding")
    if transfer_encoding and transfer_encoding != "chunked":
        raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
    chunked = transfer_encoding

    # content-encoding (gzip etc.) is not supported at all.
    content_encoding = headers.get("content-encoding")
    if content_encoding:
        raise UnsupportedEncoding("content-encoding", content_encoding)

    if chunked:
        return ChunkedResponseHandler(client, headers, status_code, url)

    content_type = headers.get("content-type")
    if content_type and "text/html" in content_type:
        return HTMLResponseHandler(client, headers, status_code, url)
    return PlainResponseHandler(client, headers, status_code, url)


def parse_uri(uri: str):
    """Split ``uri`` into ``(host, path)``, with ``path`` always starting
    with "/".

    URIs lacking a scheme (e.g. "example.com/a") would parse with an empty
    netloc, so "//" is prepended to force urlparse to treat the leading
    component as the host.
    """
    parsed = urlparse(uri)
    # If there is no netloc, the url is invalid, so prepend `//` and try again.
    if parsed.netloc == "":
        parsed = urlparse("//" + uri)
    host = parsed.netloc
    path = parsed.path
    if len(path) == 0 or path[0] != '/':
        path = "/" + path
    return host, path


class ResponseHandler:
    """Base class: holds the parsed response context and common helpers."""

    client: HTTPClient
    headers: Dict[str, str]
    status_code: int
    url: str

    def __init__(self, client: HTTPClient, headers: Dict[str, str],
                 status_code: int, url: str):
        self.client = client
        self.headers = headers
        self.status_code = status_code
        self.url = url

    def handle(self, buffer: bytes):
        """Consume the response payload (``buffer`` holds bytes already
        received past the headers). Subclasses override this."""
        pass

    def get_filename(self):
        """Returns the filename to download the payload to.

        Derived from the last path segment of ``self.url``; falls back to
        "index.html" when the path is empty, is a directory, or yields a
        name with no alphanumeric characters.
        """
        filename = "index.html"
        parsed = urlparse(self.url)
        # If there is no netloc, the url is invalid, so prepend `//` and try again.
        if parsed.netloc == "":
            parsed = urlparse("//" + self.url)
        # If the path contains a `/` get only the last part and use it as
        # filename. If the path ends with a `/`, it's a directory so ignore it.
        if len(parsed.path) != 0:
            index = parsed.path.rfind("/")
            if index == -1:
                filename = parsed.path
            elif parsed.path[-1] != "/":
                filename = parsed.path[index:]
        result = os.path.basename(filename).strip()
        if any(letter.isalnum() for letter in result):
            return result
        return "index.html"

    def _handle_download(self, client, url):
        """Receive a response on ``client`` (used for sub-resources such as
        images), require status 200, and dispatch it to a fresh handler.

        Returns the filename the payload was written to, or ``None`` on
        timeout / malformed response (an ``[ABRT]`` diagnostic is printed).
        """
        logging.debug("Waiting for response")
        try:
            buffer = client.receive()
        except TimeoutError:
            print("[ABRT] Response timed out")
            return
        try:
            (header_chunk, buffer) = client.get_crlf_chunk(buffer)
            (status_line, headers) = client.parse_headers(header_chunk)
            client.validate_status_line(status_line)
            status_code = int(status_line.split(" ")[1])
            # Sub-resource downloads only accept a plain 200; anything else
            # (redirects included) is treated as a failure.
            if status_code != 200:
                raise InvalidResponse("Code not 200")
            response_handler = construct(client, headers, status_code, url)
            filename = response_handler.handle(buffer)
            return filename
        except InvalidResponse as e:
            logging.debug("Internal error: Response could not be parsed", exc_info=e)
            print("[ABRT] Invalid response")
            return
        except InvalidStatusLine as e:
            logging.debug("Internal error: Invalid status-line in response", exc_info=e)
            print("[ABRT] Invalid response")
            return
        except UnsupportedEncoding as e:
            logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
            print("[ABRT] Invalid response")
            return


class PlainResponseHandler(ResponseHandler):
    """Downloads a non-HTML payload to a file, sized by content-length."""

    def __init__(self, client: HTTPClient, headers, status_code, url):
        super().__init__(client, headers, status_code, url)

    def _get_payload_size(self):
        """Return the expected payload size in bytes.

        ``None`` means "nothing to download" (content-length: 0); ``-1``
        means "unknown — read until the connection stops delivering".
        """
        content_length = self.__get_content_length()
        if content_length == 0:
            logging.debug("content-length is 0")
            return None
        payload_size = content_length
        if not content_length:
            payload_size = -1
            logging.debug("No content-length specified")
        else:
            logging.debug("Expected content-length=%s", payload_size)
        return payload_size

    def handle(self, buffer: bytes):
        """Write the payload to a file named from the URL; returns the name."""
        payload_size = self._get_payload_size()
        if payload_size is None:
            return
        logging.debug("Retrieving payload")
        filename = self.get_filename()
        # FIX: context manager guarantees the file is closed even if
        # _retrieve raises mid-download (the original leaked the handle).
        with open(filename, "wb") as file:
            self._retrieve(file, buffer, payload_size)
        return filename

    def _retrieve(self, file, buffer: bytes, payload_size: int):
        """Stream payload bytes to ``file`` until ``payload_size`` bytes
        have been written (or the peer stops sending)."""
        file.write(buffer)
        cur_payload_size = len(buffer)
        while cur_payload_size < payload_size:
            buffer = self.client.receive()
            logging.debug("Received payload length: %s", len(buffer))
            if len(buffer) == 0:
                # Peer closed / stalled before delivering the promised length.
                logging.warning("Received payload length %s less than expected %s",
                                cur_payload_size, payload_size)
                break
            cur_payload_size += len(buffer)
            logging.debug("Processed payload: %r", cur_payload_size)
            file.write(buffer)

    def __get_content_length(self):
        """Parse the content-length header; ``None`` if absent."""
        content_length = self.headers.get("content-length")
        if not content_length:
            return None
        return int(content_length)


class HTMLResponseHandler(PlainResponseHandler):
    """Downloads an HTML page, fetches its <img> resources, and rewrites
    their src attributes to the locally downloaded filenames."""

    def __init__(self, client: HTTPClient, headers, status_code, url):
        super().__init__(client, headers, status_code, url)

    def handle(self, buffer: bytes):
        payload_size = self._get_payload_size()
        if payload_size is None:
            return
        logging.debug("Retrieving payload")
        filename = self.get_filename()
        # The raw HTML goes to a hidden temp file first; the final file is
        # written only after image src rewriting.
        tmp_filename = "." + filename + ".tmp"
        # FIX: context manager guarantees the temp file is closed even if
        # _retrieve raises mid-download (the original leaked the handle).
        with open(tmp_filename, "wb") as file:
            self._retrieve(file, buffer, payload_size)
        self.__download_images(tmp_filename, filename)
        os.remove(tmp_filename)
        return filename

    def __download_images(self, tmp_filename, target_filename):
        """Fetch every <img> in the page at ``tmp_filename``, rewrite each
        src to its local filename, and write the result to
        ``target_filename``. Individual image failures are logged and
        skipped so one bad image cannot abort the page download."""
        (host, path) = parse_uri(self.url)
        with open(tmp_filename, "r") as fp:
            soup = BeautifulSoup(fp, "lxml")
        for tag in soup.find_all("img"):
            try:
                tag["src"] = self.__download_image(tag["src"], host, path)
            except Exception as e:
                logging.error("Failed to download image, skipping...", exc_info=e)
        with open(target_filename, 'w') as file:
            file.write(str(soup))

    def __download_image(self, img_src, host, path):
        """Download one image and return its local filename.

        Relative srcs are resolved against the page's path on the page's
        host (reusing the existing connection); absolute srcs on another
        host get a fresh HTTPClient connection on port 80.
        """
        parsed = urlparse(img_src)
        same_host = True
        if len(parsed.netloc) == 0 or parsed.netloc == host:
            img_host = host
            # FIX: startswith avoids an IndexError on an empty src path
            # (e.g. "#fragment" or "?query" srcs); empty paths resolve
            # relative to the page, same as any other relative path.
            if not parsed.path.startswith("/"):
                # Resolve relative to the directory of the page's path.
                base = os.path.split(path)[0]
                if base[-1] != '/':
                    base += "/"
                img_path = base + parsed.path
            else:
                img_path = parsed.path
        else:
            same_host = False
            (img_host, img_path) = parse_uri(img_src)

        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
        # FIX: the Host header must name the server the request is sent to;
        # the original sent the page's host even for cross-host images,
        # which breaks virtual-hosted servers (RFC 7230 §5.4).
        message += "Host: {host}\r\n\r\n".format(host=img_host)
        message = message.encode(FORMAT)

        if same_host:
            client = self.client
        else:
            client = HTTPClient(img_src)
            client.connect((img_host, 80))
        client.sendall(message)
        filename = self._handle_download(client, img_host + img_path)
        if not same_host:
            client.close()
        return filename


class ChunkedResponseHandler(ResponseHandler):
    """Placeholder for chunked transfer-encoding: recognized by construct()
    but decoding is not implemented yet — the payload is discarded."""

    def __init__(self, client: HTTPClient, headers, status_code, url):
        super().__init__(client, headers, status_code, url)

    def handle(self, buffer: bytes):
        return None