diff --git a/client/command.py b/client/command.py index f525c2d..9fec1b9 100644 --- a/client/command.py +++ b/client/command.py @@ -13,19 +13,30 @@ sockets: Dict[str, HTTPClient] = {} def create(command: str, url: str, port): + """ + Create a corresponding Command instance of the specified HTTP `command` with the specified `url` and `port`. + @param command: The command type to create + @param url: The url for the command + @param port: The port for the command + """ + + uri = parser.get_uri(url) if command == "GET": - return GetCommand(url, port) + return GetCommand(uri, port) elif command == "HEAD": - return HeadCommand(url, port) + return HeadCommand(uri, port) elif command == "POST": - return PostCommand(url, port) + return PostCommand(uri, port) elif command == "PUT": - return PutCommand(url, port) + return PutCommand(uri, port) else: raise ValueError() class AbstractCommand(ABC): + """ + A class representing the command for sending an HTTP command. + """ uri: str host: str path: str @@ -111,6 +122,9 @@ class AbstractCommand(ABC): class AbstractWithBodyCommand(AbstractCommand, ABC): + """ + The building block for creating an HTTP message for an HTTP command with a body. + """ def _build_message(self, message: str) -> bytes: body = input(f"Enter {self.command} data: ").encode(FORMAT) @@ -127,12 +141,19 @@ class AbstractWithBodyCommand(AbstractCommand, ABC): class HeadCommand(AbstractCommand): + """ + A Command for sending a `HEAD` message. + """ + @property def command(self): return "HEAD" class GetCommand(AbstractCommand): + """ + A Command for sending a `GET` message. + """ def __init__(self, uri: str, port, dir=None): super().__init__(uri, port) @@ -160,12 +181,20 @@ class GetCommand(AbstractCommand): class PostCommand(AbstractWithBodyCommand): + """ + A command for sending a `POST` command. + """ + @property def command(self): return "POST" class PutCommand(AbstractWithBodyCommand): + """ + A command for sending a `PUT` command. + """ + @property def command(self): return "PUT" diff --git a/client/httpclient.py b/client/httpclient.py index e0f23bc..68d8b71 100644 --- a/client/httpclient.py +++ b/client/httpclient.py @@ -1,6 +1,6 @@ import socket -from httplib.httpsocket import HTTPSocket +from httplib.httpsocket import HTTPSocket, InvalidResponse BUFSIZE = 4096 TIMEOUT = 3 @@ -13,3 +13,9 @@ class HTTPClient(HTTPSocket): def __init__(self, host: str): super().__init__(socket.socket(socket.AF_INET, socket.SOCK_STREAM), host) + + def read_line(self): + try: + return super().read_line() + except UnicodeDecodeError: + raise InvalidResponse("Unexpected decoding error") diff --git a/client/response_handler.py b/client/response_handler.py index c163a56..d725bef 100644 --- a/client/response_handler.py +++ b/client/response_handler.py @@ -14,7 +14,7 @@ from httplib.message import ClientMessage as Message from httplib.retriever import Retriever -def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None): +def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None): handler = BasicResponseHandler(client, msg, command) retriever = handler.handle() @@ -23,9 +23,9 @@ def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None) content_type = msg.headers.get("content-type") if content_type and "text/html" in content_type: - handler = HTMLDownloadHandler(retriever, client, msg, command, dir) + handler = HTMLDownloadHandler(retriever, client, msg, command, directory) else: - handler = RawDownloadHandler(retriever, client, msg, command, dir) + handler = RawDownloadHandler(retriever, client, msg, command, directory) return handler.handle() @@ -130,20 +130,20 @@ class BasicResponseHandler(ResponseHandler): class DownloadHandler(ResponseHandler, ABC): - def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None): + def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, directory=None): super().__init__(retriever, client, msg, cmd) - if not dir: - dir = self._create_directory() + if not directory: + directory = self._create_directory() - self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename())) + self.path = self._get_duplicate_name(os.path.join(directory, self.get_filename())) @staticmethod - def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None): + def create(retriever: Retriever, client: HTTPClient, msg, cmd, directory=None): content_type = msg.headers.get("content-type") if content_type and "text/html" in content_type: - return HTMLDownloadHandler(retriever, client, msg, cmd, dir) - return RawDownloadHandler(retriever, client, msg, cmd, dir) + return HTMLDownloadHandler(retriever, client, msg, cmd, directory) + return RawDownloadHandler(retriever, client, msg, cmd, directory) def _create_directory(self): path = self._get_duplicate_name(os.path.abspath(self.client.host)) @@ -194,14 +194,14 @@ class RawDownloadHandler(DownloadHandler): class HTMLDownloadHandler(DownloadHandler): - def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None): - super().__init__(retriever, client, msg, cmd, dir) + def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, directory=None): + super().__init__(retriever, client, msg, cmd, directory) def handle(self) -> str: - (dir, file) = os.path.split(self.path) + (directory, file) = os.path.split(self.path) tmp_filename = f".{file}.tmp" - tmp_path = os.path.join(dir, tmp_filename) + tmp_path = os.path.join(directory, tmp_filename) file = open(tmp_path, "wb") for buffer in self.retriever.retrieve(): @@ -217,11 +217,11 @@ class HTMLDownloadHandler(DownloadHandler): with open(tmp_filename, "rb") as fp: soup = BeautifulSoup(fp, 'lxml') - base_url = parser.base_url(self.cmd.uri) base_element = soup.find("base") + base_url = self.cmd.uri if base_element: - base_url = f"http://{self.cmd.host}" + base_element["href"] + base_url = parser.urljoin(self.cmd.uri, base_element["href"]) processed = {} tag: Tag @@ -241,22 +241,18 @@ class HTMLDownloadHandler(DownloadHandler): logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e) with open(target_filename, 'w') as file: - file.write(str(soup)) + file.write(soup.prettify(formatter="minimal")) def __download_image(self, img_src, base_url): + """ + Download image from the specified `img_src` and `base_url`. + If the image is available, it will be downloaded to the directory of `self.path` + """ + logging.info("Downloading image: %s", img_src) parsed = urlsplit(img_src) - - if parsed.scheme not in ("", "http", "https"): - # Not a valid url - return None - - if parsed.hostname is None: - if img_src[0] == "/": - img_src = f"http://{self.cmd.host}{img_src}" - else: - img_src = parser.absolute_url(base_url, img_src) + img_src = parser.urljoin(base_url, img_src) if parsed.hostname is None or parsed.hostname == self.cmd.host: port = self.cmd.port diff --git a/httplib/exceptions.py b/httplib/exceptions.py index 55bd1fc..1257718 100644 --- a/httplib/exceptions.py +++ b/httplib/exceptions.py @@ -17,7 +17,7 @@ class InvalidStatusLine(HTTPException): class UnsupportedEncoding(HTTPException): - """ Reponse Encoding not support """ + """ Encoding not supported """ def __init(self, enc_type, encoding): self.enc_type = enc_type @@ -39,12 +39,28 @@ class HTTPServerException(Exception): self.body = body -class BadRequest(HTTPServerException): +class HTTPServerCloseException(HTTPServerException): + """ When thrown, the connection should be closed """ + + +class BadRequest(HTTPServerCloseException): """ Malformed HTTP request""" status_code = 400 message = "Bad Request" +class Forbidden(HTTPServerException): + """ Request not allowed """ + status_code = 403 + message = "Forbidden" + + +class NotFound(HTTPServerException): + """ Resource not found """ + status_code = 404 + message = "Not Found" + + class MethodNotAllowed(HTTPServerException): """ Method is not allowed """ status_code = 405 @@ -54,7 +70,7 @@ class MethodNotAllowed(HTTPServerException): self.allowed_methods = allowed_methods -class InternalServerError(HTTPServerException): +class InternalServerError(HTTPServerCloseException): """ Internal Server Error """ status_code = 500 message = "Internal Server Error" @@ -66,16 +82,10 @@ class NotImplemented(HTTPServerException): message = "Not Implemented" -class NotFound(HTTPServerException): - """ Resource not found """ - status_code = 404 - message = "Not Found" - - -class Forbidden(HTTPServerException): - """ Request not allowed """ - status_code = 403 - message = "Forbidden" +class HTTPVersionNotSupported(HTTPServerCloseException): + """ The server does not support the major version HTTP used in the request message """ + status_code = 505 + message = "HTTP Version Not Supported" class Conflict(HTTPServerException): @@ -84,10 +94,10 @@ class Conflict(HTTPServerException): message = "Conflict" -class HTTPVersionNotSupported(HTTPServerException): - """ The server does not support the major version HTTP used in the request message """ - status_code = 505 - message = "HTTP Version Not Supported" +class NotModified(HTTPServerException): + """ Requested resource was not modified """ + status_code = 304 + message = "Not Modified" class InvalidRequestLine(BadRequest): diff --git a/httplib/httpsocket.py b/httplib/httpsocket.py index 2dc1372..f070f71 100644 --- a/httplib/httpsocket.py +++ b/httplib/httpsocket.py @@ -26,42 +26,26 @@ class HTTPSocket: self.file = self.conn.makefile("rb") def close(self): + """ + Close this socket + """ self.file.close() - # self.conn.shutdown(socket.SHUT_RDWR) self.conn.close() def is_closed(self): return self.file is None def reset_request(self): + """ + Close the file handle of this socket and create a new one. + """ self.file.close() self.file = self.conn.makefile("rb") - def __do_receive(self): - if self.conn.fileno() == -1: - raise Exception("Connection closed") - - result = self.conn.recv(BUFSIZE) - return result - - def receive(self): - """Receive data from the client up to BUFSIZE - """ - count = 0 - while True: - count += 1 - try: - return self.__do_receive() - except socket.timeout: - logging.debug("Socket receive timed out after %s seconds", TIMEOUT) - if count == 3: - break - logging.debug("Retrying %s", count) - - logging.debug("Timed out after waiting %s seconds for response", TIMEOUT * count) - raise TimeoutError("Request timed out") - def read(self, size=BUFSIZE, blocking=True) -> bytes: + """ + Read bytes up to the specified buffer size. This method will block when `blocking` is set to True (Default). + """ if blocking: buffer = self.file.read(size) else: @@ -72,14 +56,18 @@ class HTTPSocket: return buffer def read_line(self): - try: - line = str(self.read_bytes_line(), FORMAT) - except UnicodeDecodeError: - # Expected UTF-8 - raise BadRequest() - return line + """ + Read a line decoded as `httpsocket.FORMAT`. + @return: the decoded line + @raise: UnicodeDecodeError + """ + return str(self.read_bytes_line(), FORMAT) def read_bytes_line(self) -> bytes: + """ + Read a line as bytes. + """ + line = self.file.readline(MAXLINE + 1) if len(line) > MAXLINE: raise InvalidResponse("Line too long") diff --git a/httplib/message.py b/httplib/message.py index 16a9545..d21c5d6 100644 --- a/httplib/message.py +++ b/httplib/message.py @@ -23,6 +23,7 @@ class ClientMessage(Message): def __init__(self, version: str, status: int, msg: str, headers: Dict[str, str], raw=None, body: bytes = None): super().__init__(version, headers, raw, body) self.status = status + self.msg = msg class ServerMessage(Message): diff --git a/httplib/parser.py b/httplib/parser.py index 32b04c1..02a290d 100644 --- a/httplib/parser.py +++ b/httplib/parser.py @@ -1,6 +1,7 @@ import logging import os.path import re +import urllib from urllib.parse import urlparse, urlsplit from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest, InvalidRequestLine @@ -255,6 +256,19 @@ def parse_uri(uri: str): return host, port, path +def get_uri(url: str): + """ + Returns a valid URI of the specified URL. + """ + parsed = urlsplit(url) + + result = f"http://{parsed.netloc}{parsed.path}" + if parsed.query != '': + result = f"{result}?{parsed.query}" + + return result + + def base_url(uri: str): parsed = urlsplit(uri) path = parsed.path.rsplit("/", 1)[0] @@ -265,3 +279,7 @@ def absolute_url(uri: str, rel_path: str): parsed = urlsplit(uri) path = os.path.normpath(os.path.join(parsed.path, rel_path)) return f"{parsed.scheme}://{parsed.hostname}{path}" + + +def urljoin(base, url): + return urllib.parse.urljoin(base, url) diff --git a/public/index.html b/public/index.html index 225dbac..64a5752 100644 --- a/public/index.html +++ b/public/index.html @@ -48,6 +48,7 @@