diff --git a/client/command.py b/client/command.py index 15ea135..14fe48c 100644 --- a/client/command.py +++ b/client/command.py @@ -3,39 +3,40 @@ from abc import ABC, abstractmethod from typing import Dict, Tuple from urllib.parse import urlparse -from client.httpclient import FORMAT, HTTPClient +from client.httpclient import HTTPClient from httplib import parser from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding +from httplib.httpsocket import FORMAT from httplib.message import ClientMessage as Message from httplib.retriever import PreambleRetriever sockets: Dict[str, HTTPClient] = {} -def create(command: str, url: str, port): +def create(method: str, url: str, port): """ - Create a corresponding Command instance of the specified HTTP `command` with the specified `url` and `port`. - @param command: The command type to create + Create a corresponding Command instance of the specified HTTP `method` with the specified `url` and `port`. + @param method: The command type to create @param url: The url for the command @param port: The port for the command """ uri = parser.get_uri(url) - if command == "GET": + if method == "GET": return GetCommand(uri, port) - elif command == "HEAD": + elif method == "HEAD": return HeadCommand(uri, port) - elif command == "POST": + elif method == "POST": return PostCommand(uri, port) - elif command == "PUT": + elif method == "PUT": return PutCommand(uri, port) else: - raise ValueError() + raise ValueError("Unknown HTTP method") class AbstractCommand(ABC): """ - A class representing the command for sending an HTTP command. + A class representing the command for sending an HTTP request. """ uri: str host: str @@ -51,10 +52,15 @@ class AbstractCommand(ABC): @property @abstractmethod - def command(self): + def method(self): pass def execute(self, sub_request=False): + """ + Creates and sends the HTTP message for this Command. + + @param sub_request: If this execution is in function of a prior command. + """ self.sub_request = sub_request (host, path) = self.parse_uri() @@ -69,9 +75,10 @@ class AbstractCommand(ABC): client.conn.connect((host, self.port)) sockets[host] = client - message = f"{self.command} {path} HTTP/1.1\r\n" + message = f"{self.method} {path} HTTP/1.1\r\n" message += f"Host: {host}:{self.port}\r\n" - message += "Accept: */*\r\nAccept-Encoding: identity\r\n" + message += "Accept: */*\r\n" + message += "Accept-Encoding: identity\r\n" encoded_msg = self._build_message(message) logging.debug("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT)) @@ -81,8 +88,7 @@ class AbstractCommand(ABC): logging.info("HTTP request sent, awaiting response...") try: - retriever = PreambleRetriever(client) - self._await_response(client, retriever) + self._await_response(client) except InvalidResponse as e: logging.debug("Internal error: Response could not be parsed", exc_info=e) return @@ -95,7 +101,12 @@ class AbstractCommand(ABC): if not sub_request: client.close() - def _await_response(self, client, retriever): + def _await_response(self, client): + """ + Simple response method. + + Receives the response and prints to stdout. + """ while True: line = client.read_line() print(line, end="") @@ -106,11 +117,15 @@ class AbstractCommand(ABC): return (message + "\r\n").encode(FORMAT) def parse_uri(self): + """ + Parses the URI and returns the hostname and path. + @return: A tuple of the hostname and path. + """ parsed = urlparse(self.uri) # If there is no netloc, the url is invalid, so prepend `//` and try again if parsed.netloc == "": - parsed = urlparse("//" + self.uri) + parsed = urlparse("http://" + self.uri) host = parsed.netloc path = parsed.path @@ -126,11 +141,11 @@ class AbstractCommand(ABC): class AbstractWithBodyCommand(AbstractCommand, ABC): """ - The building block for creating an HTTP message for an HTTP command with a body. + The building block for creating an HTTP message for an HTTP method with a body (POST and PUT). """ def _build_message(self, message: str) -> bytes: - body = input(f"Enter {self.command} data: ").encode(FORMAT) + body = input(f"Enter {self.method} data: ").encode(FORMAT) print() message += "Content-Type: text/plain\r\n" @@ -145,29 +160,36 @@ class AbstractWithBodyCommand(AbstractCommand, ABC): class HeadCommand(AbstractCommand): """ - A Command for sending a `HEAD` message. + A Command for sending a `HEAD` request. """ @property - def command(self): + def method(self): return "HEAD" class GetCommand(AbstractCommand): """ - A Command for sending a `GET` message. + A Command for sending a `GET` request. """ + dir: str - def __init__(self, uri: str, port, dir=None): + def __init__(self, uri: str, port, directory=None): super().__init__(uri, port) - self.dir = dir + self.dir = directory self.filename = None @property - def command(self): + def method(self): return "GET" - def _get_preamble(self, retriever): + def _get_preamble(self, client): + """ + Returns the preamble (start-line and headers) of the response of this command. + @param client: the client object to retrieve from + @return: A Message object containing the HTTP-version, status code, status message, headers and buffer + """ + retriever = PreambleRetriever(client) lines = retriever.retrieve() (version, status, msg) = parser.parse_status_line(next(lines)) headers = parser.parse_headers(lines) @@ -177,8 +199,11 @@ class GetCommand(AbstractCommand): return Message(version, status, msg, headers, buffer) - def _await_response(self, client, retriever): - msg = self._get_preamble(retriever) + def _await_response(self, client): + """ + Handles the response of this command. + """ + msg = self._get_preamble(client) from client import response_handler self.filename = response_handler.handle(client, msg, self, self.dir) @@ -186,19 +211,19 @@ class GetCommand(AbstractCommand): class PostCommand(AbstractWithBodyCommand): """ - A command for sending a `POST` command. + A command for sending a `POST` request. """ @property - def command(self): + def method(self): return "POST" class PutCommand(AbstractWithBodyCommand): """ - A command for sending a `PUT` command. + A command for sending a `PUT` request. """ @property - def command(self): + def method(self): return "PUT" diff --git a/client/httpclient.py b/client/httpclient.py index 68d8b71..52c21b4 100644 --- a/client/httpclient.py +++ b/client/httpclient.py @@ -2,11 +2,6 @@ import socket from httplib.httpsocket import HTTPSocket, InvalidResponse -BUFSIZE = 4096 -TIMEOUT = 3 -FORMAT = "UTF-8" -MAXLINE = 4096 - class HTTPClient(HTTPSocket): host: str diff --git a/client/response_handler.py b/client/response_handler.py index a98292f..a36c419 100644 --- a/client/response_handler.py +++ b/client/response_handler.py @@ -4,15 +4,17 @@ import re from abc import ABC, abstractmethod from urllib.parse import urlsplit, unquote -from bs4 import BeautifulSoup, Tag - from client.command import AbstractCommand, GetCommand -from client.httpclient import HTTPClient, FORMAT +from client.httpclient import HTTPClient from httplib import parser from httplib.exceptions import InvalidResponse +from httplib.httpsocket import FORMAT from httplib.message import ClientMessage as Message from httplib.retriever import Retriever +BASE_REGEX = re.compile(r"<\s*base.*href\s*=\s*['\"](\S*)['\"][^>]*>", re.M | re.I) +IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I) + def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None): handler = BasicResponseHandler(client, msg, command) @@ -83,8 +85,10 @@ class BasicResponseHandler(ResponseHandler): if 300 <= self.msg.status < 400: # Redirect + self._skip_body() return self._do_handle_redirect() if 400 <= self.msg.status < 600: + self._skip_body() # Dump headers and exit with error if not self.cmd.sub_request: print("".join(self.msg.raw), end="") @@ -93,8 +97,6 @@ class BasicResponseHandler(ResponseHandler): return None def _do_handle_redirect(self): - self._skip_body() - if self.msg.status == 304: print("".join(self.msg.raw), end="") return None @@ -203,40 +205,61 @@ class HTMLDownloadHandler(DownloadHandler): file.write(buffer) file.close() - self._download_images(tmp_path, self.path) + charset = parser.get_charset(self.msg.headers) + self._download_images(tmp_path, self.path, charset) os.remove(tmp_path) return self.path - def _download_images(self, tmp_filename, target_filename): + def _download_images(self, tmp_filename, target_filename, charset=FORMAT): - with open(tmp_filename, "rb") as fp: - soup = BeautifulSoup(fp, 'lxml') + try: + fp = open(tmp_filename, "r", encoding=charset) + html = fp.read() + except UnicodeDecodeError: + fp = open(tmp_filename, "r", encoding=FORMAT, errors="replace") + html = fp.read() - base_element = soup.find("base") + fp.close() - base_url = self.cmd.uri - if base_element: - base_url = parser.urljoin(self.cmd.uri, base_element["href"]) + base_element = BASE_REGEX.search(html) + base_url = self.cmd.uri + if base_element: + base_url = parser.urljoin(self.cmd.uri, base_element.group(1)) - processed = {} - tag: Tag - for tag in soup.find_all("img"): - try: - if not tag.has_attr("src"): + processed = {} + to_replace = [] + + for m in IMG_REGEX.finditer(html): + url_start = m.start(1) + url_end = m.end(1) + target = m.group(1) + + try: + if len(target) == 0: + continue + if target in processed: + new_url = processed.get(target) + else: + new_url = self.__download_image(target, base_url) + if not new_url: + # Image failed to download continue - if tag["src"] in processed: - new_url = processed.get(tag["src"]) - else: - new_url = self.__download_image(tag["src"], base_url) - processed[tag["src"]] = new_url - if new_url: - tag["src"] = os.path.basename(new_url) - except Exception as e: - logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e) + processed[target] = new_url - with open(target_filename, 'w') as file: - file.write(soup.prettify(formatter="minimal")) + if new_url: + local_path = os.path.basename(new_url) + to_replace.append((url_start, url_end, local_path)) + + except Exception as e: + logging.error("Failed to download image: %s, skipping...", target, exc_info=e) + + to_replace.reverse() + for (start, end, path) in to_replace: + html = html[:start] + path + html[end:] + + with open(target_filename, 'w', encoding=FORMAT) as file: + file.write(html) def __download_image(self, img_src, base_url): """ diff --git a/httplib/parser.py b/httplib/parser.py index b928038..661395c 100644 --- a/httplib/parser.py +++ b/httplib/parser.py @@ -1,9 +1,11 @@ import logging import re import urllib +from typing import Dict from urllib.parse import urlparse, urlsplit from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest, InvalidRequestLine +from httplib.httpsocket import FORMAT def _is_valid_http_version(http_version: str): @@ -164,7 +166,7 @@ def get_uri(url: str): parsed = urlsplit(url) result = f"http://{parsed.netloc}{parsed.path}" - if parsed.query != '': + if parsed.query != "": result = f"{result}?{parsed.query}" return result @@ -175,3 +177,13 @@ def urljoin(base, url): Join a base url and a URL to form a absolute url. """ return urllib.parse.urljoin(base, url) + + +def get_charset(headers: Dict[str, str]): + if "content-type" in headers: + content_type = headers["content-type"] + match = re.search(r"charset\s*=\s*([a-z\-0-9]*)", content_type, re.I) + if match: + return match.group(1) + + return FORMAT diff --git a/requirements.txt b/requirements.txt index a7db947..cd4ef97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1 @@ -beautifulsoup4~=4.9.3 -lxml~=4.6.2 -cssutils~=2.2.0 \ No newline at end of file +lxml~=4.6.2 \ No newline at end of file diff --git a/server/command.py b/server/command.py index dbd8dec..eef4f96 100644 --- a/server/command.py +++ b/server/command.py @@ -6,12 +6,12 @@ from datetime import datetime from time import mktime from wsgiref.handlers import format_date_time -from client.httpclient import FORMAT from httplib import parser from httplib.exceptions import NotFound, Forbidden, NotModified +from httplib.httpsocket import FORMAT from httplib.message import ServerMessage as Message -root = os.path.join(os.path.dirname(sys.argv[0]), "public") +CONTENT_ROOT = os.path.join(os.path.dirname(sys.argv[0]), "public") status_message = { 200: "OK", @@ -26,6 +26,12 @@ status_message = { def create(message: Message): + """ + Creates a Command based on the specified message + @param message: the message to create the Command with. + @return: An instance of `AbstractCommand` + """ + if message.method == "GET": return GetCommand(message) elif message.method == "HEAD": @@ -102,9 +108,9 @@ class AbstractCommand(ABC): norm_path = os.path.normpath(self.msg.target.path) if norm_path == "/": - path = root + "/index.html" + path = CONTENT_ROOT + "/index.html" else: - path = root + norm_path + path = CONTENT_ROOT + norm_path if check and not os.path.exists(path): raise NotFound(path) @@ -169,7 +175,7 @@ class AbstractModifyCommand(AbstractCommand, ABC): else: status = 201 - location = parser.urljoin("/", os.path.relpath(path, root)) + location = parser.urljoin("/", os.path.relpath(path, CONTENT_ROOT)) return self._build_message(status, "text/plain", b"", {"Location": location}) diff --git a/server/serversocket.py b/server/serversocket.py index 896b41d..08edd49 100644 --- a/server/serversocket.py +++ b/server/serversocket.py @@ -3,11 +3,6 @@ import socket from httplib.exceptions import BadRequest from httplib.httpsocket import HTTPSocket -BUFSIZE = 4096 -TIMEOUT = 3 -FORMAT = "UTF-8" -MAXLINE = 4096 - class ServerSocket(HTTPSocket):