diff --git a/client.py b/client.py index 3cd9cf1..3f87206 100644 --- a/client.py +++ b/client.py @@ -1,13 +1,14 @@ #!/usr/bin/env python3 import argparse import logging -import sys -import socket import re +import socket +import sys import time -import os +from urllib.parse import urlparse -from client.ResponseHandler import ResponseHandler +from client import ResponseHandler +from client.httpclient import HTTPClient FORMAT = 'utf-8' BUFSIZE = 4096 @@ -125,22 +126,7 @@ def get_chunk(buffer: bytes): return buffer[:split_start], buffer[split_end:] -def get_html_filename(headers): - if "CONTENT-LOCATION" not in headers: - return "index.html" - - filename = headers["CONTENT-LOCATION"] - result = os.path.basename(filename).strip() - - if len(result.strip()) == 0: - return 'index.html' - - return result - - def response_parser(client: socket.socket): - client.settimeout(3.0) - try: buffer = client.recv(BUFSIZE) except TimeoutError as err: @@ -165,7 +151,7 @@ def response_parser(client: socket.socket): if payload_size == 0: return - filename = get_html_filename(headers) + filename = util.get_html_filename(headers) f = open(filename, "wb") f.write(buffer) @@ -199,6 +185,20 @@ def http_parser(client: socket.socket): logging.debug("chunk: %r", chunk) +def parse_uri(uri: str): + parsed = urlparse(uri) + + # If there is no netloc, the url is invalid, so prepend `//` and try again + if parsed.netloc == "": + parsed = urlparse("//" + uri) + + host = parsed.netloc + path = parsed.path + if len(path) == 0 or path[0] != '/': + path = "/" + path + return host, path + + def main(): parser = argparse.ArgumentParser(description='HTTP Client') parser.add_argument("--verbose", "-v", action='count', default=0, help="Increase verbosity level of logging") @@ -211,13 +211,19 @@ def main(): logging.basicConfig(level=logging.ERROR - (10 * arguments.verbose)) logging.debug("Arguments: %s", arguments) - client = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - client.connect((arguments.URI, 
arguments.port)) + (host, path) = parse_uri(arguments.URI) + client = HTTPClient(host) + client.connect((host, arguments.port)) - message = "GET /Protocols/HTTP/Performance/microscape/ HTTP/1.1\r\nHost: www.w3.org:80\r\n\r\n".encode(FORMAT) + message = "GET {path} HTTP/1.1\r\n".format(path=path) + message += "Accept: */*\r\nAccept-Encoding: identity\r\n" + message += "Host: {host}\r\n\r\n".format(host=host) + + message = message.encode(FORMAT) + logging.debug("Sending HTTP message: %r", message) client.sendall(message) - - response_parser(client) + ResponseHandler.handle(client, arguments.URI) + # response_parser(client) # http_parser(client) # tmp = b'' # keep = False diff --git a/client/ResponseHandler.py b/client/ResponseHandler.py index f690a09..394fb0a 100644 --- a/client/ResponseHandler.py +++ b/client/ResponseHandler.py @@ -1,26 +1,110 @@ +import logging import os -from socket import socket from typing import Dict from urllib.parse import urlparse +from bs4 import BeautifulSoup + +from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine + + +def handle(client: HTTPClient, url: str): + logging.debug("Waiting for response") + try: + buffer = client.receive() + except TimeoutError: + print("[ABRT] Response timed out") + return + + try: + (header_chunk, buffer) = client.get_crlf_chunk(buffer) + (status_line, headers) = client.parse_headers(header_chunk) + client.validate_status_line(status_line) + + status_code = int(status_line.split(" ")[1]) + + response_handler = construct(client, headers, status_code, url) + response_handler.handle(buffer) + + + except InvalidResponse as e: + logging.debug("Internal error: Response could not be parsed", exc_info=e) + print("[ABRT] Invalid response") + return + except InvalidStatusLine as e: + logging.debug("Internal error: Invalid status-line in response", exc_info=e) + print("[ABRT] Invalid response") + return + except UnsupportedEncoding as e: + logging.debug("Internal 
error: Unsupported encoding in response", exc_info=e) + print("[ABRT] Invalid response") + return + + +def construct(client: HTTPClient, headers, status_code, url): + # only chunked transfer-encoding is supported + transfer_encoding = headers.get("transfer-encoding") + if transfer_encoding and transfer_encoding != "chunked": + raise UnsupportedEncoding("transfer-encoding", transfer_encoding) + chunked = transfer_encoding + + # content-encoding is not supported + content_encoding = headers.get("content-encoding") + if content_encoding: + raise UnsupportedEncoding("content-encoding", content_encoding) + + if chunked: + return ChunkedResponseHandler(client, headers, status_code, url) + else: + content_type = headers.get("content-type") + if content_type and "text/html" in content_type: + return HTMLResponseHandler(client, headers, status_code, url) + return PlainResponseHandler(client, headers, status_code, url) + + +def parse_uri(uri: str): + parsed = urlparse(uri) + + # If there is no netloc, the url is invalid, so prepend `//` and try again + if parsed.netloc == "": + parsed = urlparse("//" + uri) + + host = parsed.netloc + path = parsed.path + if len(path) == 0 or path[0] != '/': + path = "/" + path + return host, path + class ResponseHandler: - client: socket - url: str + client: HTTPClient headers: Dict[str, str] + status_code: int + url: str - def __init__(self, url: str, client: socket): - self.headers = {} - self.url = url + def __init__(self, client: HTTPClient, headers: Dict[str, str], status_code: int, url: str): self.client = client + self.headers = headers + self.status_code = status_code + self.url = url pass - def get_html_filename(self): + def handle(self, buffer: bytes): + pass + + def get_filename(self): + """Returns the filename to download the payload to. 
+ """ filename = "index.html" parsed = urlparse(self.url) + + # If there is no netloc, the url is invalid, so prepend `//` and try again if parsed.netloc == "": parsed = urlparse("//" + self.url) + + # If the path contains a `/` get only the last part and use it as filename + # If the path end with a `/`, it's a directory so ignore it. if len(parsed.path) != 0: index = parsed.path.rfind("/") if index == -1: @@ -29,14 +113,179 @@ class ResponseHandler: filename = parsed.path[index:] result = os.path.basename(filename).strip() - return result + if any(letter.isalnum() for letter in result): + return result + + return "index.html" + + def _handle_download(self, client, url): + logging.debug("Waiting for response") + try: + buffer = client.receive() + except TimeoutError: + print("[ABRT] Response timed out") + return + + try: + (header_chunk, buffer) = client.get_crlf_chunk(buffer) + (status_line, headers) = client.parse_headers(header_chunk) + client.validate_status_line(status_line) + + status_code = int(status_line.split(" ")[1]) + if status_code != 200: + raise InvalidResponse("Code not 200") + + response_handler = construct(client, headers, status_code, url) + filename = response_handler.handle(buffer) + + return filename + + + except InvalidResponse as e: + logging.debug("Internal error: Response could not be parsed", exc_info=e) + print("[ABRT] Invalid response") + return + except InvalidStatusLine as e: + logging.debug("Internal error: Invalid status-line in response", exc_info=e) + print("[ABRT] Invalid response") + return + except UnsupportedEncoding as e: + logging.debug("Internal error: Unsupported encoding in response", exc_info=e) + print("[ABRT] Invalid response") + return class PlainResponseHandler(ResponseHandler): - def __init__(self, url: str, client: socket): - super().__init__(url, client) + def __init__(self, client: HTTPClient, headers, status_code, url): + super().__init__(client, headers, status_code, url) + + def _get_payload_size(self): + 
content_length = self.__get_content_length() + if content_length == 0: + logging.debug("content-length is 0") + return None + + payload_size = content_length + if not content_length: + payload_size = -1 + logging.debug("No content-length specified") + else: + logging.debug("Expected content-length=%s", payload_size) + + return payload_size + + def handle(self, buffer: bytes): + payload_size = self._get_payload_size() + if payload_size is None: + return + + logging.debug("Retrieving payload") + filename = self.get_filename() + file = open(filename, "wb") + self._retrieve(file, buffer, payload_size) + file.close() + + return filename + + def _retrieve(self, file, buffer: bytes, payload_size: int): + + file.write(buffer) + + cur_payload_size = len(buffer) + while cur_payload_size < payload_size: + buffer = self.client.receive() + logging.debug("Received payload length: %s", len(buffer)) + + if len(buffer) == 0: + logging.warning("Received payload length %s less than expected %s", cur_payload_size, payload_size) + break + + cur_payload_size += len(buffer) + logging.debug("Processed payload: %r", cur_payload_size) + file.write(buffer) + + def __get_content_length(self): + content_length = self.headers.get("content-length") + if not content_length: + return None + + return int(content_length) + + +class HTMLResponseHandler(PlainResponseHandler): + def __init__(self, client: HTTPClient, headers, status_code, url): + super().__init__(client, headers, status_code, url) + + def handle(self, buffer: bytes): + payload_size = self._get_payload_size() + if payload_size is None: + return + + logging.debug("Retrieving payload") + filename = self.get_filename() + tmp_filename = "." 
+ filename + ".tmp" + file = open(tmp_filename, "wb") + self._retrieve(file, buffer, payload_size) + file.close() + + self.__download_images(tmp_filename, filename) + os.remove(tmp_filename) + return filename + + def __download_images(self, tmp_filename, target_filename): + + (host, path) = parse_uri(self.url) + with open(tmp_filename, "r") as fp: + soup = BeautifulSoup(fp, "lxml") + + for tag in soup.find_all("img"): + try: + tag["src"] = self.__download_image(tag["src"], host, path) + except Exception as e: + logging.error("Failed to download image, skipping...", exc_info=e) + + with open(target_filename, 'w') as file: + file.write(str(soup)) + + def __download_image(self, img_src, host, path): + parsed = urlparse(img_src) + + same_host = True + if len(parsed.netloc) == 0 or parsed.netloc == host: + img_host = host + if parsed.path[0] != "/": + base = os.path.split(path)[0] + if base[-1] != '/': + base += "/" + img_path = base + parsed.path + else: + img_path = parsed.path + else: + same_host = False + (img_host, img_path) = parse_uri(img_src) + + message = "GET {path} HTTP/1.1\r\n".format(path=img_path) + message += "Accept: */*\r\nAccept-Encoding: identity\r\n" + message += "Host: {host}\r\n\r\n".format(host=host) + message = message.encode(FORMAT) + + if same_host: + client = self.client + else: + client = HTTPClient(img_src) + client.connect((img_host, 80)) + client.sendall(message) + filename = self._handle_download(client, img_host + img_path) + + if not same_host: + client.close() + + return filename class ChunkedResponseHandler(ResponseHandler): - def __init__(self, url: str, client: socket): - super().__init__(url, client) + def __init__(self, client: HTTPClient, headers, status_code, url): + super().__init__(client, headers, status_code, url) + + def handle(self, buffer: bytes): + return None diff --git a/client/httpclient.py b/client/httpclient.py new file mode 100644 index 0000000..a130a93 --- /dev/null +++ b/client/httpclient.py @@ -0,0 +1,131 @@ 
import logging
import re
import socket
from typing import Dict, Tuple

BUFSIZE = 4096
TIMEOUT = 3
FORMAT = "UTF-8"
# Number of consecutive receive timeouts tolerated before giving up.
MAX_RECEIVE_ATTEMPTS = 3


class HTTPClient(socket.socket):
    """IPv4 TCP socket with helpers for reading and parsing an HTTP response.

    The socket is created with a receive timeout of ``TIMEOUT`` seconds.
    The parsing helpers (``validate_status_line``, ``get_crlf_chunk``,
    ``parse_headers``) do not touch the socket; they only operate on the
    data passed to them.
    """

    host: str

    def __init__(self, host: str):
        """Create an unconnected socket and remember *host* on the instance."""
        super().__init__(socket.AF_INET, socket.SOCK_STREAM)
        self.settimeout(TIMEOUT)
        self.host = host

    def _do_receive(self) -> bytes:
        """Perform a single ``recv`` of at most ``BUFSIZE`` bytes.

        :raises ConnectionError: if the socket has already been closed.
        """
        # fileno() is -1 once the socket object has been closed locally.
        if self.fileno() == -1:
            # ConnectionError is still an ``Exception`` subclass, so callers
            # using a broad ``except Exception`` keep working.
            raise ConnectionError("Connection closed")

        return self.recv(BUFSIZE)

    def receive(self) -> bytes:
        """Receive up to ``BUFSIZE`` bytes, retrying when the socket times out.

        :return: the received bytes (empty when the peer closed the connection).
        :raises TimeoutError: after ``MAX_RECEIVE_ATTEMPTS`` consecutive timeouts.
        """
        for attempt in range(1, MAX_RECEIVE_ATTEMPTS + 1):
            try:
                return self._do_receive()
            except socket.timeout:
                logging.debug("Socket receive timed out after %s seconds", TIMEOUT)
                if attempt < MAX_RECEIVE_ATTEMPTS:
                    logging.debug("Retrying %s", attempt)

        logging.debug(
            "Timed out after waiting %s seconds for response",
            TIMEOUT * MAX_RECEIVE_ATTEMPTS,
        )
        raise TimeoutError("Request timed out")

    def validate_status_line(self, status_line: str) -> bool:
        """Check that *status_line* looks like ``HTTP/1.x <3 digits> [reason]``.

        Every malformed status line raises, because callers ignore the return
        value and rely on the exception to abort.

        :param status_line: first line of the response, already decoded.
        :return: ``True`` when the line is valid.
        :raises InvalidStatusLine: when the version is not ``HTTP/1.0`` or
            ``HTTP/1.1`` or the status code is not exactly three digits.
        """
        parts = [part for part in status_line.split(" ") if part]
        # The reason phrase may be empty, so only version + code are required.
        if len(parts) < 2:
            raise InvalidStatusLine(status_line)

        http_version, status_code = parts[0], parts[1]
        name, _, version = http_version.partition("/")
        # fullmatch: a prefix match would accept e.g. "1.1x" or "2000".
        if name != "HTTP" or not re.fullmatch(r"1\.[01]", version):
            raise InvalidStatusLine(status_line)

        if not re.fullmatch(r"[0-9]{3}", status_code):
            raise InvalidStatusLine(status_line)

        return True

    def get_crlf_chunk(self, buffer: bytes) -> Tuple[bytes, bytes]:
        """Split *buffer* at the first blank line (``CRLF CRLF`` or ``LF LF``).

        :param buffer: raw bytes, typically the start of a response.
        :return: ``(before, after)`` with the separator removed. When no blank
            line is present the whole buffer is returned as ``before`` and
            ``after`` is empty.
        """
        found = []
        for separator in (b"\r\n\r\n", b"\n\n"):
            pos = buffer.find(separator)
            if pos != -1:
                found.append((pos, len(separator)))

        if not found:
            return buffer, b""

        # Use whichever separator occurs first in the buffer.
        start, width = min(found)
        return buffer[:start], buffer[start + width:]

    def parse_headers(self, data: bytes) -> Tuple[str, Dict[str, str]]:
        """Parse a header block into its start line and a header mapping.

        Lines that begin with whitespace (folded continuation lines) and lines
        without a ``name: value`` shape are skipped.

        :param data: the bytes preceding the blank line of the message.
        :return: ``(start_line, headers)``; header names AND values are
            lower-cased.
        :raises InvalidResponse: if *data* is not valid UTF-8 or contains no
            non-empty line.
        """
        try:
            text = data.decode("utf-8")
        except UnicodeDecodeError as err:
            raise InvalidResponse(data) from err

        # Check the leading character BEFORE stripping; after stripping a line
        # can never start with whitespace.
        lines = [
            raw.strip()
            for raw in text.split("\n")
            if raw.strip() and not raw[0].isspace()
        ]

        if not lines:
            raise InvalidResponse(data)

        start_line, *header_lines = lines
        logging.debug("start-line: %r", start_line)

        headers = {}
        for line in header_lines:
            pos = line.find(":")

            # Skip lines with no colon, an empty name or an empty value.
            if pos <= 0 or pos >= len(line) - 1:
                continue

            # NOTE(review): values are lower-cased too, which loses the case
            # of case-sensitive values; kept because callers appear to compare
            # against lowercase literals - confirm before changing.
            headers[line[:pos].strip().lower()] = line[pos + 1:].strip().lower()

        logging.debug("Parsed headers: %r", headers)

        return start_line, headers


class HTTPException(Exception):
    """Base class for HTTP exceptions."""


class InvalidResponse(HTTPException):
    """Response message cannot be parsed."""

    def __init__(self, message):
        super().__init__(message)
        self.message = message


class InvalidStatusLine(HTTPException):
    """Response status line is invalid."""

    def __init__(self, line):
        super().__init__(line)
        self.line = line


class UnsupportedEncoding(HTTPException):
    """Response encoding is not supported."""

    def __init__(self, enc_type, encoding):
        super().__init__(enc_type, encoding)
        self.enc_type = enc_type
        self.encoding = encoding


# NOTE(review): the same patch also adds two pinned requirements to
# requirements.txt: `beautifulsoup4~=4.9.3` and `lxml==4.6.2`.