From 9ba7a030a747d5939ed245ab9184bf554aa7400e Mon Sep 17 00:00:00 2001 From: Arthur Bols Date: Mon, 22 Mar 2021 04:12:13 +0100 Subject: [PATCH] small fixes --- client/command.py | 13 ++++++++++++- client/response_handler.py | 36 ++++++++++++++++++++++++------------ httplib/parser.py | 2 +- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/client/command.py b/client/command.py index d0f583d..bce53d8 100644 --- a/client/command.py +++ b/client/command.py @@ -31,6 +31,15 @@ class AbstractCommand(ABC): else: raise ValueError() + + @staticmethod + def build_message(command, host, path): + message = f"{command} {path} HTTP/1.1\r\n" + message += f"Host: {host}\r\n" + message += "Accept: */*\r\nAccept-Encoding: identity\r\n" + + return message.encode(FORMAT) + def execute(self): (host, path) = self.parse_uri() @@ -40,7 +49,6 @@ class AbstractCommand(ABC): message = f"{self.command} {path} HTTP/1.1\r\n" message += f"Host: {host}\r\n" message += "Accept: */*\r\nAccept-Encoding: identity\r\n" - message += "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0\r\n" encoded_msg = self._build_message(message) logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT)) @@ -94,6 +102,9 @@ class AbstractCommand(ABC): class AbstractWithBodyCommand(AbstractCommand, ABC): + @staticmethod + def build_message(command, host, path): + message = AbstractCommand.build_message() def _build_message(self, message: str) -> bytes: body = input(f"Enter {self.command} data: ").encode(FORMAT) print() diff --git a/client/response_handler.py b/client/response_handler.py index f255983..1ab4247 100644 --- a/client/response_handler.py +++ b/client/response_handler.py @@ -1,8 +1,9 @@ import logging import os +import re from abc import ABC, abstractmethod from typing import Dict -from urllib.parse import urlparse +from urllib.parse import urlparse, unquote import cssutils from bs4 import BeautifulSoup, Tag @@ -107,6 +108,11 @@ class DownloadHandler(ResponseHandler, ABC): elif parsed.path[-1] != "/": filename = parsed.path[index:] + while "%" in filename: + filename = unquote(filename) + + filename = re.sub(r"[^\w.+-]+[.]*", '', filename) + result = os.path.basename(filename).strip() if any(letter.isalnum() for letter in result): return result @@ -152,7 +158,7 @@ class HTMLDownloadHandler(DownloadHandler): def handle(self) -> str: (dir, file) = os.path.split(self.path) - tmp_filename = ".{file}.tmp".format(file=file) + tmp_filename = f".{file}.tmp" tmp_path = os.path.join(dir, tmp_filename) file = open(tmp_path, "wb") @@ -180,16 +186,22 @@ class HTMLDownloadHandler(DownloadHandler): tag: Tag for tag in soup.find_all("img"): try: - if tag["src"] in processed: - new_url = processed.get(tag["src"]) + if tag.has_attr("src"): + el_name = "src" + elif tag.has_attr("data-src"): + el_name = "data-src" else: - new_url = self.__download_image(tag["src"], host, base_url) - processed[tag["src"]] = new_url + continue + + if tag[el_name] in processed: + new_url = processed.get(tag[el_name]) + else: + new_url = self.__download_image(tag[el_name], host, base_url) + processed[tag[el_name]] = new_url if new_url: - tag["src"] = new_url + tag[el_name] = new_url except Exception as e: - logging.debug(e) - logging.error("Failed to download image: %s, skipping...", tag["src"]) + logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e) for tag in soup.find_all("div"): if not tag.has_attr("style"): @@ -229,7 +241,7 @@ class HTMLDownloadHandler(DownloadHandler): logging.debug("Downloading image: %s", img_src) - if parsed.scheme not in ("", "http"): + if parsed.scheme not in ("", "http", "https"): # Not a valid url return None @@ -248,9 +260,9 @@ class HTMLDownloadHandler(DownloadHandler): same_host = False (img_host, img_path) = ResponseHandler.parse_uri(img_src) - message = "GET {path} HTTP/1.1\r\n".format(path=img_path) + message = f"GET {img_path} HTTP/1.1\r\n" message += "Accept: */*\r\nAccept-Encoding: identity\r\n" - message += "Host: {host}\r\n\r\n".format(host=host) + message += f"Host: {img_host}\r\n\r\n" message = message.encode(FORMAT) if same_host: diff --git a/httplib/parser.py b/httplib/parser.py index 664057f..366db86 100644 --- a/httplib/parser.py +++ b/httplib/parser.py @@ -8,7 +8,7 @@ from httplib.httpsocket import HTTPSocket def _get_start_line(client: HTTPSocket): line = client.read_line().strip() - split = list(filter(None, line.split(" "))) + split = list(filter(None, line.split(" ", 2))) if len(split) < 3: raise InvalidStatusLine(line) # TODO fix exception