small fixes
This commit is contained in:
@@ -31,6 +31,15 @@ class AbstractCommand(ABC):
|
|||||||
else:
|
else:
|
||||||
raise ValueError()
|
raise ValueError()
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def build_message(command, host, path):
|
||||||
|
message = f"{command} {path} HTTP/1.1\r\n"
|
||||||
|
message += f"Host: {host}\r\n"
|
||||||
|
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
||||||
|
|
||||||
|
return message.encode(FORMAT)
|
||||||
|
|
||||||
def execute(self):
|
def execute(self):
|
||||||
(host, path) = self.parse_uri()
|
(host, path) = self.parse_uri()
|
||||||
|
|
||||||
@@ -40,7 +49,6 @@ class AbstractCommand(ABC):
|
|||||||
message = f"{self.command} {path} HTTP/1.1\r\n"
|
message = f"{self.command} {path} HTTP/1.1\r\n"
|
||||||
message += f"Host: {host}\r\n"
|
message += f"Host: {host}\r\n"
|
||||||
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
||||||
message += "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0\r\n"
|
|
||||||
encoded_msg = self._build_message(message)
|
encoded_msg = self._build_message(message)
|
||||||
|
|
||||||
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
|
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
|
||||||
@@ -94,6 +102,9 @@ class AbstractCommand(ABC):
|
|||||||
|
|
||||||
class AbstractWithBodyCommand(AbstractCommand, ABC):
|
class AbstractWithBodyCommand(AbstractCommand, ABC):
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def build_message(command, host, path):
|
||||||
|
message = AbstractCommand.build_message()
|
||||||
def _build_message(self, message: str) -> bytes:
|
def _build_message(self, message: str) -> bytes:
|
||||||
body = input(f"Enter {self.command} data: ").encode(FORMAT)
|
body = input(f"Enter {self.command} data: ").encode(FORMAT)
|
||||||
print()
|
print()
|
||||||
|
@@ -1,8 +1,9 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse, unquote
|
||||||
|
|
||||||
import cssutils
|
import cssutils
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
@@ -107,6 +108,11 @@ class DownloadHandler(ResponseHandler, ABC):
|
|||||||
elif parsed.path[-1] != "/":
|
elif parsed.path[-1] != "/":
|
||||||
filename = parsed.path[index:]
|
filename = parsed.path[index:]
|
||||||
|
|
||||||
|
while "%" in filename:
|
||||||
|
filename = unquote(filename)
|
||||||
|
|
||||||
|
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
|
||||||
|
|
||||||
result = os.path.basename(filename).strip()
|
result = os.path.basename(filename).strip()
|
||||||
if any(letter.isalnum() for letter in result):
|
if any(letter.isalnum() for letter in result):
|
||||||
return result
|
return result
|
||||||
@@ -152,7 +158,7 @@ class HTMLDownloadHandler(DownloadHandler):
|
|||||||
def handle(self) -> str:
|
def handle(self) -> str:
|
||||||
|
|
||||||
(dir, file) = os.path.split(self.path)
|
(dir, file) = os.path.split(self.path)
|
||||||
tmp_filename = ".{file}.tmp".format(file=file)
|
tmp_filename = f".{file}.tmp"
|
||||||
tmp_path = os.path.join(dir, tmp_filename)
|
tmp_path = os.path.join(dir, tmp_filename)
|
||||||
file = open(tmp_path, "wb")
|
file = open(tmp_path, "wb")
|
||||||
|
|
||||||
@@ -180,16 +186,22 @@ class HTMLDownloadHandler(DownloadHandler):
|
|||||||
tag: Tag
|
tag: Tag
|
||||||
for tag in soup.find_all("img"):
|
for tag in soup.find_all("img"):
|
||||||
try:
|
try:
|
||||||
if tag["src"] in processed:
|
if tag.has_attr("src"):
|
||||||
new_url = processed.get(tag["src"])
|
el_name = "src"
|
||||||
|
elif tag.has_attr("data-src"):
|
||||||
|
el_name = "data-src"
|
||||||
else:
|
else:
|
||||||
new_url = self.__download_image(tag["src"], host, base_url)
|
continue
|
||||||
processed[tag["src"]] = new_url
|
|
||||||
|
if tag[el_name] in processed:
|
||||||
|
new_url = processed.get(tag[el_name])
|
||||||
|
else:
|
||||||
|
new_url = self.__download_image(tag[el_name], host, base_url)
|
||||||
|
processed[tag[el_name]] = new_url
|
||||||
if new_url:
|
if new_url:
|
||||||
tag["src"] = new_url
|
tag[el_name] = new_url
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.debug(e)
|
logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
|
||||||
logging.error("Failed to download image: %s, skipping...", tag["src"])
|
|
||||||
|
|
||||||
for tag in soup.find_all("div"):
|
for tag in soup.find_all("div"):
|
||||||
if not tag.has_attr("style"):
|
if not tag.has_attr("style"):
|
||||||
@@ -229,7 +241,7 @@ class HTMLDownloadHandler(DownloadHandler):
|
|||||||
|
|
||||||
logging.debug("Downloading image: %s", img_src)
|
logging.debug("Downloading image: %s", img_src)
|
||||||
|
|
||||||
if parsed.scheme not in ("", "http"):
|
if parsed.scheme not in ("", "http", "https"):
|
||||||
# Not a valid url
|
# Not a valid url
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -248,9 +260,9 @@ class HTMLDownloadHandler(DownloadHandler):
|
|||||||
same_host = False
|
same_host = False
|
||||||
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
|
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
|
||||||
|
|
||||||
message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
|
message = f"GET {img_path} HTTP/1.1\r\n"
|
||||||
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
||||||
message += "Host: {host}\r\n\r\n".format(host=host)
|
message += f"Host: {img_host}\r\n\r\n"
|
||||||
message = message.encode(FORMAT)
|
message = message.encode(FORMAT)
|
||||||
|
|
||||||
if same_host:
|
if same_host:
|
||||||
|
@@ -8,7 +8,7 @@ from httplib.httpsocket import HTTPSocket
|
|||||||
|
|
||||||
def _get_start_line(client: HTTPSocket):
|
def _get_start_line(client: HTTPSocket):
|
||||||
line = client.read_line().strip()
|
line = client.read_line().strip()
|
||||||
split = list(filter(None, line.split(" ")))
|
split = list(filter(None, line.split(" ", 2)))
|
||||||
if len(split) < 3:
|
if len(split) < 3:
|
||||||
raise InvalidStatusLine(line) # TODO fix exception
|
raise InvalidStatusLine(line) # TODO fix exception
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user