small fixes

This commit is contained in:
2021-03-22 04:12:13 +01:00
parent 42f1661e0a
commit 9ba7a030a7
3 changed files with 37 additions and 14 deletions

View File

@@ -31,6 +31,15 @@ class AbstractCommand(ABC):
else:
raise ValueError()
@staticmethod
def build_message(command, host, path):
    """Assemble the request line and default headers of an HTTP/1.1 request.

    :param command: HTTP method placed first on the request line.
    :param host: value sent in the ``Host`` header.
    :param path: request target placed after the method.
    :return: the request line followed by the ``Host``, ``Accept`` and
        ``Accept-Encoding`` headers, encoded with ``FORMAT``. The blank
        line that ends the header section is not appended here.
    """
    request_line = f"{command} {path} HTTP/1.1\r\n"
    host_header = f"Host: {host}\r\n"
    # identity encoding asks the server not to compress the response body
    default_headers = "Accept: */*\r\nAccept-Encoding: identity\r\n"
    return "".join((request_line, host_header, default_headers)).encode(FORMAT)
def execute(self):
(host, path) = self.parse_uri()
@@ -40,7 +49,6 @@ class AbstractCommand(ABC):
message = f"{self.command} {path} HTTP/1.1\r\n"
message += f"Host: {host}\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0\r\n"
encoded_msg = self._build_message(message)
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
@@ -94,6 +102,9 @@ class AbstractCommand(ABC):
class AbstractWithBodyCommand(AbstractCommand, ABC):
@staticmethod
def build_message(command, host, path):
    """Build the encoded request head for a command that carries a body.

    Delegates to ``AbstractCommand.build_message`` so both command
    families share the same request line and default headers.

    :param command: HTTP method placed first on the request line.
    :param host: value sent in the ``Host`` header.
    :param path: request target placed after the method.
    :return: whatever ``AbstractCommand.build_message`` returns for the
        same arguments.
    """
    # The base builder requires all three arguments; calling it with none
    # raised TypeError, and the result was never returned to the caller.
    return AbstractCommand.build_message(command, host, path)
def _build_message(self, message: str) -> bytes:
body = input(f"Enter {self.command} data: ").encode(FORMAT)
print()

View File

@@ -1,8 +1,9 @@
import logging
import os
import re
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse
from urllib.parse import urlparse, unquote
import cssutils
from bs4 import BeautifulSoup, Tag
@@ -107,6 +108,11 @@ class DownloadHandler(ResponseHandler, ABC):
elif parsed.path[-1] != "/":
filename = parsed.path[index:]
while "%" in filename:
filename = unquote(filename)
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
result = os.path.basename(filename).strip()
if any(letter.isalnum() for letter in result):
return result
@@ -152,7 +158,7 @@ class HTMLDownloadHandler(DownloadHandler):
def handle(self) -> str:
(dir, file) = os.path.split(self.path)
tmp_filename = ".{file}.tmp".format(file=file)
tmp_filename = f".{file}.tmp"
tmp_path = os.path.join(dir, tmp_filename)
file = open(tmp_path, "wb")
@@ -180,16 +186,22 @@ class HTMLDownloadHandler(DownloadHandler):
tag: Tag
for tag in soup.find_all("img"):
try:
if tag["src"] in processed:
new_url = processed.get(tag["src"])
if tag.has_attr("src"):
el_name = "src"
elif tag.has_attr("data-src"):
el_name = "data-src"
else:
new_url = self.__download_image(tag["src"], host, base_url)
processed[tag["src"]] = new_url
continue
if tag[el_name] in processed:
new_url = processed.get(tag[el_name])
else:
new_url = self.__download_image(tag[el_name], host, base_url)
processed[tag[el_name]] = new_url
if new_url:
tag["src"] = new_url
tag[el_name] = new_url
except Exception as e:
logging.debug(e)
logging.error("Failed to download image: %s, skipping...", tag["src"])
logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
for tag in soup.find_all("div"):
if not tag.has_attr("style"):
@@ -229,7 +241,7 @@ class HTMLDownloadHandler(DownloadHandler):
logging.debug("Downloading image: %s", img_src)
if parsed.scheme not in ("", "http"):
if parsed.scheme not in ("", "http", "https"):
# Not a valid url
return None
@@ -248,9 +260,9 @@ class HTMLDownloadHandler(DownloadHandler):
same_host = False
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
message = f"GET {img_path} HTTP/1.1\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += "Host: {host}\r\n\r\n".format(host=host)
message += f"Host: {img_host}\r\n\r\n"
message = message.encode(FORMAT)
if same_host:

View File

@@ -8,7 +8,7 @@ from httplib.httpsocket import HTTPSocket
def _get_start_line(client: HTTPSocket):
line = client.read_line().strip()
split = list(filter(None, line.split(" ")))
split = list(filter(None, line.split(" ", 2)))
if len(split) < 3:
raise InvalidStatusLine(line) # TODO fix exception