diff --git a/client.py b/client.py index 3f87206..1c39160 100644 --- a/client.py +++ b/client.py @@ -196,6 +196,11 @@ def parse_uri(uri: str): path = parsed.path if len(path) == 0 or path[0] != '/': path = "/" + path + + port_pos = host.find(":") + if port_pos >= 0: + host = host[:port_pos] + return host, path @@ -213,7 +218,7 @@ def main(): (host, path) = parse_uri(arguments.URI) client = HTTPClient(host) - client.connect((host, arguments.port)) + client.connect((host, int(arguments.port))) message = "GET {path} HTTP/1.1\r\n".format(path=path) message += "Accept: */*\r\nAccept-Encoding: identity\r\n" diff --git a/client/ResponseHandler.py b/client/ResponseHandler.py index 394fb0a..0292ea5 100644 --- a/client/ResponseHandler.py +++ b/client/ResponseHandler.py @@ -1,46 +1,110 @@ import logging import os +import re from typing import Dict from urllib.parse import urlparse from bs4 import BeautifulSoup +from client.Retriever import Retriever from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine def handle(client: HTTPClient, url: str): logging.debug("Waiting for response") - try: - buffer = client.receive() - except TimeoutError: - print("[ABRT] Response timed out") - return try: - (header_chunk, buffer) = client.get_crlf_chunk(buffer) - (status_line, headers) = client.parse_headers(header_chunk) - client.validate_status_line(status_line) - - status_code = int(status_line.split(" ")[1]) - - response_handler = construct(client, headers, status_code, url) - response_handler.handle(buffer) + (version, status, _) = get_status_line(client) + logging.debug("Parsed status-line: version: %s, status: %s", version, status) + headers = get_headers(client) + logging.debug("Parsed headers: %r", headers) + response_handler = construct(client, headers, status, url) + response_handler.handle() except InvalidResponse as e: logging.debug("Internal error: Response could not be parsed", exc_info=e) - print("[ABRT] Invalid response") return except InvalidStatusLine as e: logging.debug("Internal error: Invalid status-line in response", exc_info=e) - print("[ABRT] Invalid response") return except UnsupportedEncoding as e: logging.debug("Internal error: Unsupported encoding in response", exc_info=e) - print("[ABRT] Invalid response") return +def get_status_line(client: HTTPClient): + line = client.read_line() + + split = list(filter(None, line.split(" "))) + if len(split) < 3: + raise InvalidStatusLine(line) + + # Check HTTP version + http_version = split.pop(0) + if len(http_version) < 8 or http_version[4] != "/": + raise InvalidStatusLine(line) + + (name, version) = http_version[:4], http_version[5:] + if name != "HTTP" or not re.match(r"1\.[0|1]", version): + raise InvalidStatusLine(line) + + status = split.pop(0) + if not re.match(r"\d{3}", status): + raise InvalidStatusLine(line) + status = int(status) + if status < 100 or status > 999: + raise InvalidStatusLine(line) + + reason = split.pop(0) + return version, status, reason + + +def get_headers(client: HTTPClient): + headers = [] + # first header after the status-line may not contain a space + while True: + line = client.read_line() + if line[0].isspace(): + continue + else: + break + + while True: + if line in ("\r\n", "\n", " "): + break + + if line[0].isspace(): + headers[-1] = headers[-1].rstrip("\r\n") + + headers.append(line.lstrip()) + line = client.read_line() + + result = {} + header_str = "".join(headers) + for line in header_str.splitlines(): + pos = line.find(":") + + if pos <= 0 or pos >= len(line) - 1: + continue + + (header, value) = map(str.strip, line.split(":", 1)) + check_next_header(result, header, value) + result[header.lower()] = value.lower() + + return result + + +def check_next_header(headers, next_header: str, next_value: str): + if next_header == "content-length": + if "content-length" in headers: + logging.error("Multiple content-length headers specified") + raise InvalidResponse() + if not next_value.isnumeric() or int(next_value) <= 0: + logging.error("Invalid content-length value: %r", next_value) + raise InvalidResponse() + + def construct(client: HTTPClient, headers, status_code, url): # only chunked transfer-encoding is supported transfer_encoding = headers.get("transfer-encoding") @@ -53,13 +117,12 @@ def construct(client: HTTPClient, headers, status_code, url): if content_encoding: raise UnsupportedEncoding("content-encoding", content_encoding) - if chunked: - return ChunkedResponseHandler(client, headers, status_code, url) - else: - content_type = headers.get("content-type") - if content_type and "text/html" in content_type: - return HTMLResponseHandler(client, headers, status_code, url) - return PlainResponseHandler(client, headers, status_code, url) + retriever = Retriever.create(client, headers) + + content_type = headers.get("content-type") + if content_type and "text/html" in content_type: + return HTMLDownloadHandler(retriever, client, headers, url) + return RawDownloadHandler(retriever, client, headers, url) def parse_uri(uri: str): @@ -81,17 +144,54 @@ class ResponseHandler: headers: Dict[str, str] status_code: int url: str + retriever: Retriever - def __init__(self, client: HTTPClient, headers: Dict[str, str], status_code: int, url: str): + def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str): self.client = client self.headers = headers - self.status_code = status_code self.url = url + self.retriever = retriever pass - def handle(self, buffer: bytes): + def handle(self): pass + +class DownloadHandler(ResponseHandler): + path: str + + def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None): + super().__init__(retriever, client, headers, url) + + if not dir: + dir = self._create_directory() + + self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename())) + + @staticmethod + def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None): + content_type = headers.get("content-type") + if content_type and "text/html" in content_type: + return HTMLDownloadHandler(retriever, client, headers, url, dir) + return RawDownloadHandler(retriever, client, headers, url, dir) + + def handle(self) -> str: + pass + + def _create_directory(self): + path = self._get_duplicate_name(os.path.abspath(self.client.host)) + os.mkdir(path) + return path + + def _get_duplicate_name(self, path): + tmp_path = path + i = 0 + while os.path.exists(tmp_path): + i += 1 + tmp_path = "{path}.{counter}".format(path=path, counter=i) + + return tmp_path + def get_filename(self): """Returns the filename to download the payload to. """ @@ -118,131 +218,68 @@ class ResponseHandler: return "index.html" - def _handle_download(self, client, url): - logging.debug("Waiting for response") - try: - buffer = client.receive() - except TimeoutError: - print("[ABRT] Response timed out") - return + def _handle_sub_request(self, client, url): - try: - (header_chunk, buffer) = client.get_crlf_chunk(buffer) - (status_line, headers) = client.parse_headers(header_chunk) - client.validate_status_line(status_line) + (version, status, _) = get_status_line(client) + logging.debug("Parsed status-line: version: %s, status: %s", version, status) + headers = get_headers(client) + logging.debug("Parsed headers: %r", headers) - status_code = int(status_line.split(" ")[1]) - if status_code != 200: - raise InvalidResponse("Code not 200") + if status != 200: + raise InvalidResponse("Status not expected 200: " + str(status)) - response_handler = construct(client, headers, status_code, url) - filename = response_handler.handle(buffer) + retriever = Retriever.create(client, headers) + handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path)) - return filename + return handler.handle() - except InvalidResponse as e: - logging.debug("Internal error: Response could not be parsed", exc_info=e) - print("[ABRT] Invalid response") - return - except InvalidStatusLine as e: - logging.debug("Internal error: Invalid status-line in response", exc_info=e) - print("[ABRT] Invalid response") - return - except UnsupportedEncoding as e: - logging.debug("Internal error: Unsupported encoding in response", exc_info=e) - print("[ABRT] Invalid response") - return +class RawDownloadHandler(DownloadHandler): + def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None): + super().__init__(retriever, client, headers, url, dir) -class PlainResponseHandler(ResponseHandler): - def __init__(self, client: HTTPClient, headers, status_code, url): - super().__init__(client, headers, status_code, url) - - def _get_payload_size(self): - content_length = self.__get_content_length() - if content_length == 0: - logging.debug("content-length is 0") - return None - - payload_size = content_length - if not content_length: - payload_size = -1 - logging.debug("No content-length specified") - else: - logging.debug("Expected content-length=%s", payload_size) - - return payload_size - - def handle(self, buffer: bytes): - payload_size = self._get_payload_size() - if payload_size is None: - return - + def handle(self) -> str: logging.debug("Retrieving payload") - filename = self.get_filename() - file = open(filename, "wb") - self._retrieve(file, buffer, payload_size) - file.close() + file = open(self.path, "wb") - return filename - - def _retrieve(self, file, buffer: bytes, payload_size: int): - - file.write(buffer) - - cur_payload_size = len(buffer) - while cur_payload_size < payload_size: - buffer = self.client.receive() - logging.debug("Received payload length: %s", len(buffer)) - - if len(buffer) == 0: - logging.warning("Received payload length %s less than expected %s", cur_payload_size, payload_size) - break - - cur_payload_size += len(buffer) - logging.debug("Processed payload: %r", cur_payload_size) + for buffer in self.retriever.retrieve(): file.write(buffer) - - def __get_content_length(self): - content_length = self.headers.get("content-length") - if not content_length: - return None - - return int(content_length) - - -class HTMLResponseHandler(PlainResponseHandler): - def __init__(self, client: HTTPClient, headers, status_code, url): - super().__init__(client, headers, status_code, url) - - def handle(self, buffer: bytes): - payload_size = self._get_payload_size() - if payload_size is None: - return - - logging.debug("Retrieving payload") - filename = self.get_filename() - tmp_filename = "." + filename + ".tmp" - file = open(tmp_filename, "wb") - self._retrieve(file, buffer, payload_size) file.close() - self.__download_images(tmp_filename, filename) - os.remove(tmp_filename) - return filename + return self.path + + +class HTMLDownloadHandler(DownloadHandler): + def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None): + super().__init__(retriever, client, headers, url, dir) + + def handle(self) -> str: + + (dir, file) = os.path.split(self.path) + tmp_filename = ".{file}.tmp".format(file=file) + tmp_path = os.path.join(dir, tmp_filename) + file = open(tmp_path, "wb") + + for buffer in self.retriever.retrieve(): + file.write(buffer) + file.close() + + self.__download_images(tmp_path, self.path) + os.remove(tmp_path) + return self.path def __download_images(self, tmp_filename, target_filename): (host, path) = parse_uri(self.url) - with open(tmp_filename, "r") as fp: - soup = BeautifulSoup(fp, "lxml") + with open(tmp_filename, "rb") as fp: + soup = BeautifulSoup(fp, 'html.parser') for tag in soup.find_all("img"): try: tag["src"] = self.__download_image(tag["src"], host, path) except Exception as e: - logging.error("Failed to download image, skipping...", exc_info=e) + logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e) with open(target_filename, 'w') as file: file.write(str(soup)) @@ -250,6 +287,8 @@ class HTMLResponseHandler(PlainResponseHandler): def __download_image(self, img_src, host, path): parsed = urlparse(img_src) + logging.debug("Downloading image: %s", img_src) + same_host = True if len(parsed.netloc) == 0 or parsed.netloc == host: img_host = host @@ -271,21 +310,14 @@ class HTMLResponseHandler(PlainResponseHandler): if same_host: client = self.client + client.reset_request() else: client = HTTPClient(img_src) client.connect((img_host, 80)) client.sendall(message) - filename = self._handle_download(client, img_host + img_path) + filename = self._handle_sub_request(client, img_host + img_path) if not same_host: client.close() return filename - - -class ChunkedResponseHandler(ResponseHandler): - def __init__(self, client: HTTPClient, headers, status_code, url): - super().__init__(client, headers, status_code, url) - - def handle(self, buffer: bytes): - return None diff --git a/client/Retriever.py b/client/Retriever.py new file mode 100644 index 0000000..6971e49 --- /dev/null +++ b/client/Retriever.py @@ -0,0 +1,120 @@ +import logging +from typing import Dict + +from client.httpclient import HTTPClient, BUFSIZE, IncompleteResponse, InvalidResponse, UnsupportedEncoding + + +class Retriever: + client: HTTPClient + headers: Dict[str, str] + + def __init__(self, client: HTTPClient): + self.client = client + + def retrieve(self): + pass + + @staticmethod + def create(client: HTTPClient, headers: Dict[str, str]): + + # only chunked transfer-encoding is supported + transfer_encoding = headers.get("transfer-encoding") + if transfer_encoding and transfer_encoding != "chunked": + raise UnsupportedEncoding("transfer-encoding", transfer_encoding) + chunked = transfer_encoding + + # content-encoding is not supported + content_encoding = headers.get("content-encoding") + if content_encoding: + raise UnsupportedEncoding("content-encoding", content_encoding) + + if chunked: + return ChunkedRetriever(client) + else: + content_length = headers.get("content-length") + + if not content_length: + logging.warning("Transfer-encoding and content-length not specified, trying without") + return RawRetriever(client) + + return ContentLengthRetriever(client, int(content_length)) + + +class ContentLengthRetriever(Retriever): + length: int + + def __init__(self, client: HTTPClient, length: int): + super().__init__(client) + self.length = length + + def retrieve(self): + + cur_payload_size = 0 + read_size = BUFSIZE + while cur_payload_size < self.length: + + remaining = self.length - cur_payload_size + if remaining < read_size: + read_size = remaining + + try: + buffer = self.client.read(remaining) + except TimeoutError: + logging.error("Timed out before receiving complete payload") + self.client.close() + raise IncompleteResponse("Timed out before receiving complete payload") + except ConnectionError: + logging.error("Timed out before receiving complete payload") + self.client.close() + raise IncompleteResponse("Connection closed before receiving complete payload") + + logging.debug("Received payload length: %s", len(buffer)) + + if len(buffer) == 0: + logging.warning("Received payload length %s less than expected %s", cur_payload_size, self.length) + break + + cur_payload_size += len(buffer) + logging.debug("Processed payload: %r", cur_payload_size) + yield buffer + + return b"" + + +class RawRetriever(Retriever): + + def retrieve(self): + while True: + try: + yield self.client.read() + except TimeoutError or ConnectionError: + return b"" + + +class ChunkedRetriever(Retriever): + + def retrieve(self): + while True: + chunk_size = self._get_chunk_size() + logging.debug("chunk-size: %s", chunk_size) + if chunk_size == 0: + self.client.reset_request() + break + + buffer = self.client.read(chunk_size) + logging.debug("chunk: %r", buffer) + yield buffer + + self.client.read_line() # remove CRLF + return b"" + + def _get_chunk_size(self): + line = self.client.read_line() + sep_pos = line.find(";") + if sep_pos >= 0: + line = line[:sep_pos] + + try: + return int(line, 16) + except ValueError: + raise InvalidResponse() diff --git a/client/httpclient.py b/client/httpclient.py index a130a93..c2b5c17 100644 --- a/client/httpclient.py +++ b/client/httpclient.py @@ -1,21 +1,35 @@ import logging import re import socket -from typing import Dict +from io import BufferedReader +from typing import TextIO, IO BUFSIZE = 4096 TIMEOUT = 3 FORMAT = "UTF-8" +MAXLINE = 4096 class HTTPClient(socket.socket): host: str + file: BufferedReader def __init__(self, host: str): super().__init__(socket.AF_INET, socket.SOCK_STREAM) self.settimeout(TIMEOUT) self.host = host + self.setblocking(True) + self.settimeout(3.0) + self.file = self.makefile("rb") + + def close(self): + self.file.close() + super().close() + + def reset_request(self): + self.file.close() + self.file = self.makefile("rb") def _do_receive(self): if self.fileno() == -1: @@ -41,6 +55,26 @@ class HTTPClient(socket.socket): logging.debug("Timed out after waiting %s seconds for response", TIMEOUT * count) raise TimeoutError("Request timed out") + def read(self, size=BUFSIZE, blocking=True) -> bytes: + if blocking: + return self.file.read(size) + + return self.file.read1(size) + + def read_line(self): + return str(self.read_bytes_line(), FORMAT) + + def read_bytes_line(self): + """ + + :rtype: bytes + """ + line = self.file.readline(MAXLINE + 1) + if len(line) > MAXLINE: + raise InvalidResponse("Line too long") + + return line + def validate_status_line(self, status_line: str): split = list(filter(None, status_line.split(" "))) if len(split) < 3: @@ -129,3 +163,7 @@ class UnsupportedEncoding(HTTPException): def __init(self, enc_type, encoding): self.enc_type = enc_type self.encoding = encoding + +class IncompleteResponse(HTTPException): + def __init(self, cause): + self.cause = cause \ No newline at end of file diff --git a/server.py b/server.py index 8bb3e2a..0d5c7ce 100644 --- a/server.py +++ b/server.py @@ -3,6 +3,7 @@ import socket # socket heeft een listening and accept method +import time SERVER = "127.0.0.1" #dynamisch fixen in project PORT = 5055 @@ -26,8 +27,11 @@ def start(): while connected: # while client is connected, we want to recieve messages msg = conn.recv(HEADER).decode(FORMAT).rstrip() # Argument is maximum size of msg (in project look into details of accp), decode is for converting bytes to strings, rstrip is for stripping messages for special hidden characters print("message: ", msg) - if msg == DISCONNECT_MESSAGE: - connected = False + for i in range(0,10): + conn.send(b"test") + time.sleep(1) + + break print("close connection ", addr[0], " disconnected.") conn.close()