From f15ff38f6910102356530b85b77a71b21746991d Mon Sep 17 00:00:00 2001 From: Arthur Bols Date: Thu, 25 Mar 2021 17:56:21 +0100 Subject: [PATCH] client: fix image url parsing --- client.py | 2 +- client/command.py | 4 ++-- client/response_handler.py | 23 +++++++++----------- httplib/message.py | 4 +++- httplib/parser.py | 6 ++++++ server.py | 43 +------------------------------------- server/command.py | 2 +- 7 files changed, 24 insertions(+), 60 deletions(-) diff --git a/client.py b/client.py index 86338ac..6a70aa0 100644 --- a/client.py +++ b/client.py @@ -15,7 +15,7 @@ def main(): arguments = parser.parse_args() - logging.basicConfig(level=logging.ERROR - (10 * arguments.verbose)) + logging.basicConfig(level=logging.ERROR - (10 * arguments.verbose), format="[%(levelname)s] %(message)s") logging.debug("Arguments: %s", arguments) command = cmd.create(arguments.command, arguments.URI, arguments.port) diff --git a/client/command.py b/client/command.py index 573b9b6..41dddba 100644 --- a/client/command.py +++ b/client/command.py @@ -148,9 +148,9 @@ class GetCommand(AbstractCommand): (version, status, msg) = parser.parse_status_line(next(lines)) headers = parser.parse_headers(lines) - logging.debug("---response begin---\r\n%s--- response end---", "".join(retriever.buffer)) + logging.debug("---response begin---\r\n%s---response end---", "".join(retriever.buffer)) - return Message(version, status, msg, headers) + return Message(version, status, msg, headers, retriever.buffer) def _await_response(self, client, retriever): msg = self._get_preamble(retriever) diff --git a/client/response_handler.py b/client/response_handler.py index cfedca7..951bf80 100644 --- a/client/response_handler.py +++ b/client/response_handler.py @@ -88,8 +88,7 @@ class BasicResponseHandler(ResponseHandler): if self.msg.status == 101: # Switching protocols is not supported - print(f"{self.msg.version} {self.msg.status} {self.msg.msg}") - print(self.msg.headers) + print("".join(self.msg.raw), end="") return if 200 <= self.msg.status < 300: @@ -100,8 +99,7 @@ class BasicResponseHandler(ResponseHandler): return self._do_handle_redirect() if 400 <= self.msg.status < 500: # Dump headers and exit with error - print(f"{self.msg.version} {self.msg.status} {self.msg.msg}") - print(self.msg.headers) + print("".join(self.msg.raw), end="") return None def _do_handle_redirect(self): @@ -216,15 +214,14 @@ class HTMLDownloadHandler(DownloadHandler): def _download_images(self, tmp_filename, target_filename): - (host, path) = ResponseHandler.parse_uri(self.cmd.uri) with open(tmp_filename, "rb") as fp: soup = BeautifulSoup(fp, 'lxml') - base_url = self.cmd.uri + base_url = parser.base_url(self.cmd.uri) base_element = soup.find("base") if base_element: - base_url = base_element["href"] + base_url = f"http://{self.cmd.host}" + base_element["href"] processed = {} tag: Tag @@ -236,7 +233,7 @@ class HTMLDownloadHandler(DownloadHandler): if tag["src"] in processed: new_url = processed.get(tag["src"]) else: - new_url = self.__download_image(tag["src"], host, base_url) + new_url = self.__download_image(tag["src"], base_url) processed[tag["src"]] = new_url if new_url: tag["src"] = os.path.basename(new_url) @@ -246,8 +243,8 @@ class HTMLDownloadHandler(DownloadHandler): with open(target_filename, 'w') as file: file.write(str(soup)) - def __download_image(self, img_src, host, base_url): - logging.debug("Downloading image: %s", img_src) + def __download_image(self, img_src, base_url): + logging.info("Downloading image: %s", img_src) parsed = urlsplit(img_src) @@ -257,11 +254,11 @@ class HTMLDownloadHandler(DownloadHandler): if parsed.hostname is None: if img_src[0] == "/": - img_src = host + img_src + img_src = f"http://{self.cmd.host}{img_src}" else: - img_src = os.path.join(os.path.dirname(base_url), img_src) + img_src = os.path.join(base_url, img_src) - if parsed.hostname is None or parsed.hostname == host: + if parsed.hostname is None or parsed.hostname == self.cmd.host: port = self.cmd.port elif ":" in parsed.netloc: port = parsed.netloc.split(":", 1)[1] diff --git a/httplib/message.py b/httplib/message.py index a773368..80fcb14 100644 --- a/httplib/message.py +++ b/httplib/message.py @@ -6,11 +6,13 @@ class Message: status: int msg: str headers: Dict[str, str] + raw: str body: bytes - def __init__(self, version: str, status: int, msg: str, headers: Dict[str, str], body: bytes = None): + def __init__(self, version: str, status: int, msg: str, headers: Dict[str, str], raw=None, body: bytes = None): self.version = version self.status = status self.msg = msg self.headers = headers + self.raw = raw self.body = body diff --git a/httplib/parser.py b/httplib/parser.py index 0404421..3bb663d 100644 --- a/httplib/parser.py +++ b/httplib/parser.py @@ -245,3 +245,9 @@ def parse_uri(uri: str): port = 80 return host, port, path + + +def base_url(uri: str): + parsed = urlsplit(uri) + path = parsed.path.rsplit("/", 1)[0] + return f"{parsed.scheme}://{parsed.hostname}{path}/" diff --git a/server.py b/server.py index 8abaa00..2bbddc9 100644 --- a/server.py +++ b/server.py @@ -46,45 +46,4 @@ try: except Exception as e: print("[ABRT] Internal error: " + str(e), file=sys.stderr) logging.debug("Internal error", exc_info=e) - sys.exit(70) - -# import socket -# -# # Get hostname and address -# hostname = socket.gethostname() -# address = socket.gethostbyname(hostname) -# -# # socket heeft een listening and accept method -# -# SERVER = "127.0.0.1" # dynamisch fixen in project -# PORT = 5055 -# server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) -# -# ADDR = (SERVER, PORT) # hier wordt de socket gebonden aan mijn IP adres, dit moet wel anders -# server.bind(ADDR) # in het project gebeuren -# -# HEADER = 64 # maximum size messages -# FORMAT = 'utf-8' -# DISCONNECT_MESSAGE = "DISCONNECT!" # special message for disconnecting client and server -# -# -# # function for starting server -# def start(): -# pass -# server.listen() -# while True: # infinite loop in which server accept incoming connections, we want to run it forever -# conn, addr = server.accept() # Server blocks untill a client connects -# print("new connection: ", addr[0], " connected.") -# connected = True -# while connected: # while client is connected, we want to recieve messages -# msg = conn.recv(HEADER).decode( -# FORMAT).rstrip() # Argument is maximum size of msg (in project look into details of accp), decode is for converting bytes to strings, rstrip is for stripping messages for special hidden characters -# print("message: ", msg) -# if msg == DISCONNECT_MESSAGE: -# connected = False -# print("close connection ", addr[0], " disconnected.") -# conn.close() -# -# -# print("server is starting ... ") -# start() + sys.exit(70) \ No newline at end of file diff --git a/server/command.py b/server/command.py index 548c542..adbfa3c 100644 --- a/server/command.py +++ b/server/command.py @@ -77,7 +77,7 @@ class GetCommand(AbstractCommand): logging.debug("---response begin---\r\n%s--- response end---", "".join(retriever.buffer)) - return Message(version, status, msg, headers) + return Message(version, status, msg, headers, retriever.buffer) def _await_response(self, client, retriever): msg = self._get_preamble(retriever)