From d14252f707e8b43119d13f186535c91f4d3a6529 Mon Sep 17 00:00:00 2001
From: Arthur Bols
Date: Wed, 24 Mar 2021 16:35:12 +0100
Subject: [PATCH] Update

---
 client.py                  |   4 +-
 client/command.py          | 102 +++++++++-----
 client/htmlparser.py       |   6 +
 client/response_handler.py | 266 ++++++++++++++++++-------------------
 httplib/MessageParser.py   |   0
 httplib/httpsocket.py      |   7 +-
 httplib/message.py         |  16 +++
 httplib/parser.py          |  78 ++++++++++-
 httplib/retriever.py       |  28 +++-
 server/worker.py           |   3 +-
 10 files changed, 325 insertions(+), 185 deletions(-)
 create mode 100644 client/htmlparser.py
 delete mode 100644 httplib/MessageParser.py
 create mode 100644 httplib/message.py

diff --git a/client.py b/client.py
index 988a4d0..86338ac 100644
--- a/client.py
+++ b/client.py
@@ -3,7 +3,7 @@
 import argparse
 import logging
 import sys
-from client.command import AbstractCommand
+from client import command as cmd
 
 
 def main():
@@ -18,7 +18,7 @@ def main():
     logging.basicConfig(level=logging.ERROR - (10 * arguments.verbose))
     logging.debug("Arguments: %s", arguments)
 
-    command = AbstractCommand.create(arguments.command, arguments.URI, arguments.port)
+    command = cmd.create(arguments.command, arguments.URI, arguments.port)
 
     command.execute()
 
diff --git a/client/command.py b/client/command.py
index bce53d8..817ad1b 100644
--- a/client/command.py
+++ b/client/command.py
@@ -1,16 +1,39 @@
 import logging
 from abc import ABC, abstractmethod
+from typing import Dict, Tuple
 from urllib.parse import urlparse
 
-from client.response_handler import ResponseHandler
 from client.httpclient import FORMAT, HTTPClient
 from httplib import parser
 from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding
+from httplib.message import Message
+from httplib.retriever import PreambleRetriever
+
+sockets: Dict[str, HTTPClient] = {}
+
+
+def create(command: str, url: str, port):
+    if command == "GET":
+        return GetCommand(url, port)
+    elif command == "HEAD":
+        return HeadCommand(url, port)
+    elif command == "POST":
+        return PostCommand(url, port)
+    elif command == "PUT":
+        return PutCommand(url, port)
+    else:
+        raise ValueError()
 
 
 class AbstractCommand(ABC):
+    uri: str
+    host: str
+    path: str
+    port: Tuple[str, int]
 
-    def __init__(self, url: str, port: str):
-        self.url = url
+    def __init__(self, uri: str, port):
+        self.uri = uri
+        self.host, _, self.path = parser.parse_uri(uri)
         self.port = port
 
     @property
@@ -18,20 +41,6 @@ class AbstractCommand(ABC):
     def command(self):
         pass
 
-    @staticmethod
-    def create(command: str, url: str, port: str):
-        if command == "GET":
-            return GetCommand(url, port)
-        elif command == "HEAD":
-            return HeadCommand(url, port)
-        elif command == "POST":
-            return PostCommand(url, port)
-        elif command == "PUT":
-            return PutCommand(url, port)
-        else:
-            raise ValueError()
-
-
     @staticmethod
     def build_message(command, host, path):
         message = f"{command} {path} HTTP/1.1\r\n"
@@ -40,26 +49,34 @@ class AbstractCommand(ABC):
 
         return message.encode(FORMAT)
 
-    def execute(self):
+    def execute(self, sub_request=False):
         (host, path) = self.parse_uri()
-        client = HTTPClient(host)
-        client.conn.connect((host, int(self.port)))
+        client = sockets.get(host)
+
+        if client and client.is_closed():
+            sockets.pop(self.host)
+            client = None
+
+        if not client:
+            client = HTTPClient(host)
+            client.conn.connect((host, self.port))
+            sockets[host] = client
 
         message = f"{self.command} {path} HTTP/1.1\r\n"
-        message += f"Host: {host}\r\n"
+        message += f"Host: {host}:{self.port}\r\n"
         message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
 
         encoded_msg = self._build_message(message)
-        logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
+        logging.debug("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
 
-        logging.debug("Sending HTTP message: %r", encoded_msg)
         client.conn.sendall(encoded_msg)
         logging.info("HTTP request sent, awaiting response...")
 
         try:
-            self._await_response(client)
+            retriever = PreambleRetriever(client)
+            self._await_response(client, retriever)
         except InvalidResponse as e:
             logging.debug("Internal error: Response could not be parsed", exc_info=e)
             return
@@ -69,9 +86,10 @@ class AbstractCommand(ABC):
         except UnsupportedEncoding as e:
             logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
         finally:
-            client.close()
+            if not sub_request:
+                client.close()
 
-    def _await_response(self, client):
+    def _await_response(self, client, retriever):
         while True:
             line = client.read_line()
             print(line, end="")
@@ -82,11 +100,11 @@ class AbstractCommand(ABC):
         return (message + "\r\n").encode(FORMAT)
 
     def parse_uri(self):
-        parsed = urlparse(self.url)
+        parsed = urlparse(self.uri)
 
         # If there is no netloc, the url is invalid, so prepend `//` and try again
         if parsed.netloc == "":
-            parsed = urlparse("//" + self.url)
+            parsed = urlparse("//" + self.uri)
 
         host = parsed.netloc
         path = parsed.path
@@ -105,6 +123,7 @@ class AbstractWithBodyCommand(AbstractCommand, ABC):
     @staticmethod
     def build_message(command, host, path):
         message = AbstractCommand.build_message()
+
     def _build_message(self, message: str) -> bytes:
         body = input(f"Enter {self.command} data: ").encode(FORMAT)
         print()
@@ -126,18 +145,31 @@ class HeadCommand(AbstractCommand):
 
 
 class GetCommand(AbstractCommand):
+
+    def __init__(self, uri: str, port, dir=None):
+        super().__init__(uri, port)
+        self.dir = dir
+        self.filename = None
+
     @property
     def command(self):
         return "GET"
 
-    def _await_response(self, client):
-        (version, status, msg) = parser.get_status_line(client)
-        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
-        headers = parser.get_headers(client)
-        logging.debug("Parsed headers: %r", headers)
+    def _get_preamble(self, retriever):
+        lines = retriever.retrieve()
+        (version, status, msg) = parser.parse_status_line(next(lines))
+        headers = parser.parse_headers(lines)
 
-        handler = ResponseHandler.create(client, headers, status, self.url)
-        handler.handle()
+        logging.debug("---response begin---\r\n%s--- response end---", "".join(retriever.buffer))
+
+        return Message(version, status, msg, headers)
+
+    def _await_response(self, client, retriever) -> str:
+        msg = self._get_preamble(retriever)
+
+        from client import response_handler
+        self.filename = response_handler.handle(client, msg, self, self.dir)
+        return
 
 
 class PostCommand(AbstractWithBodyCommand):
diff --git a/client/htmlparser.py b/client/htmlparser.py
new file mode 100644
index 0000000..ebd91da
--- /dev/null
+++ b/client/htmlparser.py
@@ -0,0 +1,6 @@
+from bs4 import BeautifulSoup
+
+
+class HTMLParser:
+    def __init__(self, soup: BeautifulSoup):
+        pass
\ No newline at end of file
diff --git a/client/response_handler.py b/client/response_handler.py
index 1ab4247..43e3839 100644
--- a/client/response_handler.py
+++ b/client/response_handler.py
@@ -2,52 +2,57 @@
 import logging
 import os
 import re
 from abc import ABC, abstractmethod
-from typing import Dict
-from urllib.parse import urlparse, unquote
+from urllib.parse import urlsplit, unquote
 
-import cssutils
 from bs4 import BeautifulSoup, Tag
 
+from client.command import AbstractCommand, GetCommand
 from client.httpclient import HTTPClient, FORMAT
 from httplib import parser
 from httplib.exceptions import InvalidResponse
+from httplib.message import Message
 from httplib.retriever import Retriever
 
 
+def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
+    handler = BasicResponseHandler(client, msg, command)
+    retriever = handler.handle()
+
+    if retriever is None:
+        return
+
+    content_type = msg.headers.get("content-type")
+    if content_type and "text/html" in content_type:
+        handler = HTMLDownloadHandler(retriever, client, msg, command, dir)
+    else:
+        handler = RawDownloadHandler(retriever, client, msg, command, dir)
+
+    return handler.handle()
+
+
 class ResponseHandler(ABC):
     client: HTTPClient
-    headers: Dict[str, str]
-    status_code: int
-    url: str
     retriever: Retriever
+    msg: Message
+    cmd: AbstractCommand
 
-    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
+    def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd):
         self.client = client
-        self.headers = headers
-        self.url = url
         self.retriever = retriever
-        pass
+        self.msg = msg
+        self.cmd = cmd
 
     @abstractmethod
     def handle(self):
         pass
 
-    @staticmethod
-    def create(client: HTTPClient, headers, status_code, url):
-        retriever = Retriever.create(client, headers)
-
-        content_type = headers.get("content-type")
-        if content_type and "text/html" in content_type:
-            return HTMLDownloadHandler(retriever, client, headers, url)
-        return RawDownloadHandler(retriever, client, headers, url)
-
     @staticmethod
     def parse_uri(uri: str):
-        parsed = urlparse(uri)
+        parsed = urlsplit(uri)
 
         # If there is no netloc, the url is invalid, so prepend `//` and try again
         if parsed.netloc == "":
-            parsed = urlparse("//" + uri)
+            parsed = urlsplit("//" + uri)
 
         host = parsed.netloc
         path = parsed.path
@@ -56,11 +61,79 @@ class ResponseHandler(ABC):
 
         return host, path
 
 
-class DownloadHandler(ResponseHandler, ABC):
-    path: str
+class BasicResponseHandler(ResponseHandler):
+    """ Response handler which throws away the body and only shows the headers.
+    In case of a redirect, it will process it and pass it to the appropriate response handler.
+    """
 
-    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
-        super().__init__(retriever, client, headers, url)
+    def __init__(self, client: HTTPClient, msg: Message, cmd: AbstractCommand):
+        retriever = Retriever.create(client, msg.headers)
+        super().__init__(retriever, client, msg, cmd)
+
+    def handle(self):
+        return self._handle_status()
+
+    def _skip_body(self):
+        logging.debug("Skipping body: [")
+        for line in self.retriever.retrieve():
+            try:
+                logging.debug("%s", line.decode(FORMAT))
+            except Exception:
+                logging.debug("%r", line)
+
+        logging.debug("] done.")
+
+    def _handle_status(self):
+        logging.info("%d %s", self.msg.status, self.msg.msg)
+
+        if self.msg.status == 101:
+            # Switching protocols is not supported
+            print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
+            print(self.msg.headers)
+            return
+
+        if 200 <= self.msg.status < 300:
+            return self.retriever
+
+        if 300 <= self.msg.status < 400:
+            # Redirect
+            return self._do_handle_redirect()
+        if 400 <= self.msg.status < 500:
+            # Dump headers and exit with error
+            print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
+            print(self.msg.headers)
+        return None
+
+    def _do_handle_redirect(self):
+        self._skip_body()
+
+        location = self.msg.headers.get("location")
+        if not location:
+            raise InvalidResponse("No location in redirect")
+
+        parsed_location = urlsplit(location)
+        if not parsed_location.hostname:
+            raise InvalidResponse("Invalid location")
+
+        if not parsed_location.scheme == "http":
+            raise InvalidResponse("Only http is supported")
+
+        self.cmd.uri = location
+        self.cmd.host, self.cmd.port, self.cmd.path = parser.parse_uri(location)
+
+        if self.msg.status == 301:
+            logging.info("Status 301. Closing socket [%s]", self.cmd.host)
+            self.client.close()
+
+        self.cmd.execute()
+
+        return None
+
+
+class DownloadHandler(ResponseHandler, ABC):
+
+    def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
+        super().__init__(retriever, client, msg, cmd)
 
         if not dir:
             dir = self._create_directory()
@@ -68,11 +141,11 @@ class DownloadHandler(ResponseHandler, ABC):
         self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
 
     @staticmethod
-    def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
-        content_type = headers.get("content-type")
+    def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
+        content_type = msg.headers.get("content-type")
         if content_type and "text/html" in content_type:
-            return HTMLDownloadHandler(retriever, client, headers, url, dir)
-        return RawDownloadHandler(retriever, client, headers, url, dir)
+            return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
+        return RawDownloadHandler(retriever, client, msg, cmd, dir)
 
     def _create_directory(self):
         path = self._get_duplicate_name(os.path.abspath(self.client.host))
@@ -91,54 +164,25 @@ class DownloadHandler(ResponseHandler, ABC):
     def get_filename(self):
         """Returns the filename to download the payload to.
         """
-        filename = "index.html"
-
-        parsed = urlparse(self.url)
-
-        # If there is no netloc, the url is invalid, so prepend `//` and try again
-        if parsed.netloc == "":
-            parsed = urlparse("//" + self.url)
-
-        # If the path contains a `/` get only the last part and use it as filename
-        # If the path end with a `/`, it's a directory so ignore it.
-        if len(parsed.path) != 0:
-            index = parsed.path.rfind("/")
-            if index == -1:
-                filename = parsed.path
-            elif parsed.path[-1] != "/":
-                filename = parsed.path[index:]
+        filename = os.path.basename(self.cmd.path)
+        if filename == '':
+            return "index.html"
 
         while "%" in filename:
             filename = unquote(filename)
 
         filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
-
         result = os.path.basename(filename).strip()
         if any(letter.isalnum() for letter in result):
            return result
 
         return "index.html"
 
-    def _handle_sub_request(self, client, url):
-
-        (version, status, _) = parser.get_status_line(client)
-        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
-        headers = parser.get_headers(client)
-        logging.debug("Parsed headers: %r", headers)
-
-        if status != 200:
-            raise InvalidResponse("Status not expected 200: " + str(status))
-
-        retriever = Retriever.create(client, headers)
-        handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))
-
-        return handler.handle()
-
 
 class RawDownloadHandler(DownloadHandler):
 
-    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
-        super().__init__(retriever, client, headers, url, dir)
+    def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
+        super().__init__(retriever, client, msg, cmd, dir)
 
     def handle(self) -> str:
         logging.debug("Retrieving payload")
@@ -152,8 +196,8 @@ class RawDownloadHandler(DownloadHandler):
 
 class HTMLDownloadHandler(DownloadHandler):
 
-    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
-        super().__init__(retriever, client, headers, url, dir)
+    def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
+        super().__init__(retriever, client, msg, cmd, dir)
 
     def handle(self) -> str:
 
@@ -172,11 +216,11 @@ class HTMLDownloadHandler(DownloadHandler):
 
     def _download_images(self, tmp_filename, target_filename):
 
-        (host, path) = ResponseHandler.parse_uri(self.url)
+        (host, path) = ResponseHandler.parse_uri(self.cmd.uri)
 
         with open(tmp_filename, "rb") as fp:
             soup = BeautifulSoup(fp, 'lxml')
-            base_url = self.url
+            base_url = self.cmd.uri
 
             base_element = soup.find("base")
             if base_element:
@@ -186,58 +230,24 @@ class HTMLDownloadHandler(DownloadHandler):
             tag: Tag
             for tag in soup.find_all("img"):
                 try:
-                    if tag.has_attr("src"):
-                        el_name = "src"
-                    elif tag.has_attr("data-src"):
-                        el_name = "data-src"
-                    else:
+                    if not tag.has_attr("src"):
                         continue
 
-                    if tag[el_name] in processed:
-                        new_url = processed.get(tag[el_name])
+                    if tag["src"] in processed:
+                        new_url = processed.get(tag["src"])
                     else:
-                        new_url = self.__download_image(tag[el_name], host, base_url)
-                        processed[tag[el_name]] = new_url
+                        new_url = self.__download_image(tag["src"], host, base_url)
+                        processed[tag["src"]] = new_url
 
                     if new_url:
-                        tag[el_name] = new_url
+                        tag["src"] = new_url
                 except Exception as e:
-                    logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
-
-            for tag in soup.find_all("div"):
-                if not tag.has_attr("style"):
-                    continue
-                style = cssutils.parseStyle(tag["style"])
-
-                if "background" in style and "url(" in style["background"]:
-                    el_name = "background"
-                elif "background-image" in style and "url(" in style["background-image"]:
-                    el_name = "background-image"
-                else:
-                    continue
-                el = style[el_name]
-                start = el.find("url(") + 4
-                end = el.find(")", start)
-                url = el[start:end].strip()
-
-                try:
-                    if url in processed:
-                        new_url = url
-                    else:
-                        new_url = self.__download_image(url, host, base_url)
-                        processed[url] = new_url
-                    if new_url:
-                        el = el[:start] + new_url + el[end:]
-                        style[el_name] = el
-                        tag["style"] = style.cssText
-                except Exception as e:
-                    logging.debug("Internal error", exc_info=e)
-                    logging.error("Failed to download image: %s, skipping...", tag["src"])
+                    logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
 
         with open(target_filename, 'w') as file:
             file.write(str(soup))
 
     def __download_image(self, img_src, host, base_url):
-        parsed = urlparse(img_src)
+        parsed = urlsplit(img_src)
 
         logging.debug("Downloading image: %s", img_src)
 
@@ -245,36 +255,18 @@ class HTMLDownloadHandler(DownloadHandler):
             # Not a valid url
             return None
 
+        if parsed.hostname == host:
+            port = self.cmd.port
+        elif ":" in parsed.netloc:
+            port = parsed.netloc.split(":", 1)[1]
+        else:
+            port = 80
+
         if len(parsed.netloc) == 0 and parsed.path != "/":
             # relative url, append base_url
             img_src = os.path.join(os.path.dirname(base_url), parsed.path)
-            parsed = urlparse(img_src)
 
+        command = GetCommand(img_src, port, os.path.dirname(self.path))
+        command.execute(True)
 
-        # Check if the image is located on the same server
-        if len(parsed.netloc) == 0 or parsed.netloc == host:
-            same_host = True
-            img_host = host
-            img_path = parsed.path
-        else:
-            same_host = False
-            (img_host, img_path) = ResponseHandler.parse_uri(img_src)
-
-        message = f"GET {img_path} HTTP/1.1\r\n"
-        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
-        message += f"Host: {img_host}\r\n\r\n"
-        message = message.encode(FORMAT)
-
-        if same_host:
-            client = self.client
-            client.reset_request()
-        else:
-            client = HTTPClient(img_src)
-            client.conn.connect((img_host, 80))
-        client.conn.sendall(message)
-        filename = self._handle_sub_request(client, img_host + img_path)
-
-        if not same_host:
-            client.close()
-
-        return filename
+        return command.filename
diff --git a/httplib/MessageParser.py b/httplib/MessageParser.py
deleted file mode 100644
index e69de29..0000000
diff --git a/httplib/httpsocket.py b/httplib/httpsocket.py
index 7df1fd8..a881760 100644
--- a/httplib/httpsocket.py
+++ b/httplib/httpsocket.py
@@ -1,6 +1,7 @@
 import logging
 import socket
 from io import BufferedReader
+from typing import Tuple
 
 BUFSIZE = 4096
 TIMEOUT = 3
@@ -11,7 +12,7 @@ MAXLINE = 4096
 class HTTPSocket:
     host: str
     conn: socket.socket
-    file: BufferedReader
+    file: Tuple[BufferedReader, None]
 
     def __init__(self, conn: socket.socket, host: str):
 
@@ -24,8 +25,12 @@ class HTTPSocket:
 
     def close(self):
         self.file.close()
+        # self.conn.shutdown(socket.SHUT_RDWR)
         self.conn.close()
 
+    def is_closed(self):
+        return self.file is None
+
     def reset_request(self):
         self.file.close()
         self.file = self.conn.makefile("rb")
diff --git a/httplib/message.py b/httplib/message.py
new file mode 100644
index 0000000..a773368
--- /dev/null
+++ b/httplib/message.py
@@ -0,0 +1,16 @@
+from typing import Dict
+
+
+class Message:
+    version: str
+    status: int
+    msg: str
+    headers: Dict[str, str]
+    body: bytes
+
+    def __init__(self, version: str, status: int, msg: str, headers: Dict[str, str], body: bytes = None):
+        self.version = version
+        self.status = status
+        self.msg = msg
+        self.headers = headers
+        self.body = body
diff --git a/httplib/parser.py b/httplib/parser.py
index 366db86..91c621c 100644
--- a/httplib/parser.py
+++ b/httplib/parser.py
@@ -1,6 +1,6 @@
 import logging
 import re
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urlsplit
 
 from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest
 from httplib.httpsocket import HTTPSocket
@@ -42,6 +42,26 @@ def get_status_line(client: HTTPSocket):
     return version, status, reason
 
 
+def parse_status_line(line: str):
+    split = list(filter(None, line.strip().split(" ", 2)))
+    if len(split) < 3:
+        raise InvalidStatusLine(line)  # TODO fix exception
+
+    (http_version, status, reason) = split
+
+    if not _is_valid_http_version(http_version):
+        raise InvalidStatusLine(line)
+    version = http_version[:4]
+
+    if not re.match(r"\d{3}", status):
+        raise InvalidStatusLine(line)
+    status = int(status)
+    if status < 100 or status > 999:
+        raise InvalidStatusLine(line)
+
+    return version, status, reason
+
+
 def parse_request_line(client: HTTPSocket):
     line, (method, target, version) = _get_start_line(client)
 
@@ -119,7 +139,7 @@ def parse_request_headers(client: HTTPSocket):
             raise BadRequest()
 
         headers[key] = value
-        
+
     return headers
 
 
@@ -157,6 +177,38 @@ def get_headers(client: HTTPSocket):
 
     return result
 
 
+def parse_headers(lines):
+    headers = []
+    # first header after the status-line may not contain a space
+    for line in lines:
+        line = next(lines)
+        if line[0].isspace():
+            continue
+        else:
+            break
+
+    for line in lines:
+        if line in ("\r\n", "\n", " "):
+            break
+
+        if line[0].isspace():
+            headers[-1] = headers[-1].rstrip("\r\n")
+
+        headers.append(line.lstrip())
+
+    result = {}
+    header_str = "".join(headers)
+    for line in header_str.splitlines():
+        pos = line.find(":")
+
+        if pos <= 0 or pos >= len(line) - 1:
+            continue
+
+        (header, value) = map(str.strip, line.split(":", 1))
+        check_next_header(result, header, value)
+        result[header.lower()] = value.lower()
+
+    return result
+
+
 def check_next_header(headers, next_header: str, next_value: str):
     if next_header == "content-length":
@@ -166,3 +218,25 @@ def check_next_header(headers, next_header: str, next_value: str):
         if not next_value.isnumeric() or int(next_value) <= 0:
             logging.error("Invalid content-length value: %r", next_value)
             raise InvalidResponse()
+
+
+def parse_uri(uri: str):
+    parsed = urlsplit(uri)
+
+    # If there is no netloc, the given string is not a valid URI, so split on /
+    if parsed.hostname:
+        host = parsed.hostname
+        path = parsed.path
+        if parsed.query != '':
+            path = f"{path}?{parsed.query}"
+    else:
+        (host, path) = uri.split("/", 1)
+
+    if ":" in host:
+        host, port = host.split(":", 1)
+    elif parsed.scheme == "https":
+        port = 443
+    else:
+        port = 80
+
+    return host, port, path
diff --git a/httplib/retriever.py b/httplib/retriever.py
index 280a3d6..eaee0cf 100644
--- a/httplib/retriever.py
+++ b/httplib/retriever.py
@@ -42,6 +42,28 @@ class Retriever(ABC):
 
         return ContentLengthRetriever(client, int(content_length))
 
 
+class PreambleRetriever(Retriever):
+    client: HTTPSocket
+    buffer: []
+
+    def __init__(self, client: HTTPSocket):
+        super().__init__(client)
+        self.client = client
+        self.buffer = []
+
+    def retrieve(self):
+
+        line = self.client.read_line()
+        while True:
+            self.buffer.append(line)
+
+            if line in ("\r\n", "\n", " "):
+                break
+
+            yield line
+            line = self.client.read_line()
+
+
 class ContentLengthRetriever(Retriever):
     length: int
 
@@ -63,21 +85,16 @@ class ContentLengthRetriever(Retriever):
                 buffer = self.client.read(remaining)
             except TimeoutError:
                 logging.error("Timed out before receiving complete payload")
-                self.client.close()
                 raise IncompleteResponse("Timed out before receiving complete payload")
             except ConnectionError:
                 logging.error("Timed out before receiving complete payload")
-                self.client.close()
                 raise IncompleteResponse("Connection closed before receiving complete payload")
 
-            logging.debug("Received payload length: %s", len(buffer))
-
             if len(buffer) == 0:
                 logging.warning("Received payload length %s less than expected %s", cur_payload_size, self.length)
                 break
 
             cur_payload_size += len(buffer)
-            logging.debug("Processed payload: %r", cur_payload_size)
 
             yield buffer
 
         return b""
@@ -108,7 +125,6 @@ class ChunkedRetriever(Retriever):
 
             yield buffer
             self.client.read_line()  # remove CRLF
-
         return b""
 
     def __get_chunk_size(self):
         line = self.client.read_line()
diff --git a/server/worker.py b/server/worker.py
index 497d270..36b2c5d 100644
--- a/server/worker.py
+++ b/server/worker.py
@@ -1,5 +1,4 @@
 import logging
-import multiprocessing
 import multiprocessing as mp
 import threading
 from concurrent.futures import ThreadPoolExecutor
@@ -69,7 +68,7 @@ class Worker:
 
                 handler = RequestHandler(conn, self.host)
                 handler.listen()
-            except Exception as e:
+            except Exception:
                 logging.debug("Internal error")
 
                 conn.shutdown(socket.SHUT_RDWR)