small fixes

This commit is contained in:
2021-03-22 04:12:13 +01:00
parent 42f1661e0a
commit 9ba7a030a7
3 changed files with 37 additions and 14 deletions

View File

@@ -31,6 +31,15 @@ class AbstractCommand(ABC):
else: else:
raise ValueError() raise ValueError()
@staticmethod
def build_message(command, host, path):
    """Assemble the request line and default headers of an HTTP/1.1 request.

    Args:
        command: Token placed first on the request line (the HTTP method).
        host: Value written into the ``Host`` header.
        path: Request target placed after the method on the request line.

    Returns:
        The request line followed by the ``Host``, ``Accept`` and
        ``Accept-Encoding`` headers, encoded with ``FORMAT``.

    Note:
        The result ends right after the last header line; the empty line
        that terminates the header section is not appended here.
    """
    request_line = f"{command} {path} HTTP/1.1\r\n"
    host_header = f"Host: {host}\r\n"
    accept_headers = "Accept: */*\r\nAccept-Encoding: identity\r\n"
    return "".join((request_line, host_header, accept_headers)).encode(FORMAT)
def execute(self): def execute(self):
(host, path) = self.parse_uri() (host, path) = self.parse_uri()
@@ -40,7 +49,6 @@ class AbstractCommand(ABC):
message = f"{self.command} {path} HTTP/1.1\r\n" message = f"{self.command} {path} HTTP/1.1\r\n"
message += f"Host: {host}\r\n" message += f"Host: {host}\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n" message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0\r\n"
encoded_msg = self._build_message(message) encoded_msg = self._build_message(message)
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT)) logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
@@ -94,6 +102,9 @@ class AbstractCommand(ABC):
class AbstractWithBodyCommand(AbstractCommand, ABC): class AbstractWithBodyCommand(AbstractCommand, ABC):
@staticmethod
def build_message(command, host, path):
    """Build the request head for a command that carries a body.

    Delegates to ``AbstractCommand.build_message`` so both command
    families share the same request line and default headers.

    Args:
        command: Token placed first on the request line (the HTTP method).
        host: Value written into the ``Host`` header.
        path: Request target placed after the method on the request line.

    Returns:
        Whatever ``AbstractCommand.build_message`` returns for the same
        arguments.
    """
    # The parent requires all three arguments; calling it with none raised
    # TypeError, and the result was previously dropped instead of returned.
    message = AbstractCommand.build_message(command, host, path)
    return message
def _build_message(self, message: str) -> bytes: def _build_message(self, message: str) -> bytes:
body = input(f"Enter {self.command} data: ").encode(FORMAT) body = input(f"Enter {self.command} data: ").encode(FORMAT)
print() print()

View File

@@ -1,8 +1,9 @@
import logging import logging
import os import os
import re
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Dict from typing import Dict
from urllib.parse import urlparse from urllib.parse import urlparse, unquote
import cssutils import cssutils
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
@@ -107,6 +108,11 @@ class DownloadHandler(ResponseHandler, ABC):
elif parsed.path[-1] != "/": elif parsed.path[-1] != "/":
filename = parsed.path[index:] filename = parsed.path[index:]
while "%" in filename:
filename = unquote(filename)
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
result = os.path.basename(filename).strip() result = os.path.basename(filename).strip()
if any(letter.isalnum() for letter in result): if any(letter.isalnum() for letter in result):
return result return result
@@ -152,7 +158,7 @@ class HTMLDownloadHandler(DownloadHandler):
def handle(self) -> str: def handle(self) -> str:
(dir, file) = os.path.split(self.path) (dir, file) = os.path.split(self.path)
tmp_filename = ".{file}.tmp".format(file=file) tmp_filename = f".{file}.tmp"
tmp_path = os.path.join(dir, tmp_filename) tmp_path = os.path.join(dir, tmp_filename)
file = open(tmp_path, "wb") file = open(tmp_path, "wb")
@@ -180,16 +186,22 @@ class HTMLDownloadHandler(DownloadHandler):
tag: Tag tag: Tag
for tag in soup.find_all("img"): for tag in soup.find_all("img"):
try: try:
if tag["src"] in processed: if tag.has_attr("src"):
new_url = processed.get(tag["src"]) el_name = "src"
elif tag.has_attr("data-src"):
el_name = "data-src"
else: else:
new_url = self.__download_image(tag["src"], host, base_url) continue
processed[tag["src"]] = new_url
if tag[el_name] in processed:
new_url = processed.get(tag[el_name])
else:
new_url = self.__download_image(tag[el_name], host, base_url)
processed[tag[el_name]] = new_url
if new_url: if new_url:
tag["src"] = new_url tag[el_name] = new_url
except Exception as e: except Exception as e:
logging.debug(e) logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
logging.error("Failed to download image: %s, skipping...", tag["src"])
for tag in soup.find_all("div"): for tag in soup.find_all("div"):
if not tag.has_attr("style"): if not tag.has_attr("style"):
@@ -229,7 +241,7 @@ class HTMLDownloadHandler(DownloadHandler):
logging.debug("Downloading image: %s", img_src) logging.debug("Downloading image: %s", img_src)
if parsed.scheme not in ("", "http"): if parsed.scheme not in ("", "http", "https"):
# Not a valid url # Not a valid url
return None return None
@@ -248,9 +260,9 @@ class HTMLDownloadHandler(DownloadHandler):
same_host = False same_host = False
(img_host, img_path) = ResponseHandler.parse_uri(img_src) (img_host, img_path) = ResponseHandler.parse_uri(img_src)
message = "GET {path} HTTP/1.1\r\n".format(path=img_path) message = f"GET {img_path} HTTP/1.1\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n" message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += "Host: {host}\r\n\r\n".format(host=host) message += f"Host: {img_host}\r\n\r\n"
message = message.encode(FORMAT) message = message.encode(FORMAT)
if same_host: if same_host:

View File

@@ -8,7 +8,7 @@ from httplib.httpsocket import HTTPSocket
def _get_start_line(client: HTTPSocket): def _get_start_line(client: HTTPSocket):
line = client.read_line().strip() line = client.read_line().strip()
split = list(filter(None, line.split(" "))) split = list(filter(None, line.split(" ", 2)))
if len(split) < 3: if len(split) < 3:
raise InvalidStatusLine(line) # TODO fix exception raise InvalidStatusLine(line) # TODO fix exception