update
This commit is contained in:
@@ -14,7 +14,7 @@ from httplib.message import ClientMessage as Message
|
||||
from httplib.retriever import Retriever
|
||||
|
||||
|
||||
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
|
||||
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
|
||||
handler = BasicResponseHandler(client, msg, command)
|
||||
retriever = handler.handle()
|
||||
|
||||
@@ -23,9 +23,9 @@ def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None)
|
||||
|
||||
content_type = msg.headers.get("content-type")
|
||||
if content_type and "text/html" in content_type:
|
||||
handler = HTMLDownloadHandler(retriever, client, msg, command, dir)
|
||||
handler = HTMLDownloadHandler(retriever, client, msg, command, directory)
|
||||
else:
|
||||
handler = RawDownloadHandler(retriever, client, msg, command, dir)
|
||||
handler = RawDownloadHandler(retriever, client, msg, command, directory)
|
||||
|
||||
return handler.handle()
|
||||
|
||||
@@ -130,20 +130,20 @@ class BasicResponseHandler(ResponseHandler):
|
||||
|
||||
class DownloadHandler(ResponseHandler, ABC):
|
||||
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, directory=None):
|
||||
super().__init__(retriever, client, msg, cmd)
|
||||
|
||||
if not dir:
|
||||
dir = self._create_directory()
|
||||
if not directory:
|
||||
directory = self._create_directory()
|
||||
|
||||
self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
|
||||
self.path = self._get_duplicate_name(os.path.join(directory, self.get_filename()))
|
||||
|
||||
@staticmethod
|
||||
def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
|
||||
def create(retriever: Retriever, client: HTTPClient, msg, cmd, directory=None):
|
||||
content_type = msg.headers.get("content-type")
|
||||
if content_type and "text/html" in content_type:
|
||||
return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
|
||||
return RawDownloadHandler(retriever, client, msg, cmd, dir)
|
||||
return HTMLDownloadHandler(retriever, client, msg, cmd, directory)
|
||||
return RawDownloadHandler(retriever, client, msg, cmd, directory)
|
||||
|
||||
def _create_directory(self):
|
||||
path = self._get_duplicate_name(os.path.abspath(self.client.host))
|
||||
@@ -194,14 +194,14 @@ class RawDownloadHandler(DownloadHandler):
|
||||
|
||||
|
||||
class HTMLDownloadHandler(DownloadHandler):
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
|
||||
super().__init__(retriever, client, msg, cmd, dir)
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, directory=None):
|
||||
super().__init__(retriever, client, msg, cmd, directory)
|
||||
|
||||
def handle(self) -> str:
|
||||
|
||||
(dir, file) = os.path.split(self.path)
|
||||
(directory, file) = os.path.split(self.path)
|
||||
tmp_filename = f".{file}.tmp"
|
||||
tmp_path = os.path.join(dir, tmp_filename)
|
||||
tmp_path = os.path.join(directory, tmp_filename)
|
||||
file = open(tmp_path, "wb")
|
||||
|
||||
for buffer in self.retriever.retrieve():
|
||||
@@ -217,11 +217,11 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
with open(tmp_filename, "rb") as fp:
|
||||
soup = BeautifulSoup(fp, 'lxml')
|
||||
|
||||
base_url = parser.base_url(self.cmd.uri)
|
||||
base_element = soup.find("base")
|
||||
|
||||
base_url = self.cmd.uri
|
||||
if base_element:
|
||||
base_url = f"http://{self.cmd.host}" + base_element["href"]
|
||||
base_url = parser.urljoin(self.cmd.uri, base_element["href"])
|
||||
|
||||
processed = {}
|
||||
tag: Tag
|
||||
@@ -241,22 +241,18 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
|
||||
|
||||
with open(target_filename, 'w') as file:
|
||||
file.write(str(soup))
|
||||
file.write(soup.prettify(formatter="minimal"))
|
||||
|
||||
def __download_image(self, img_src, base_url):
|
||||
"""
|
||||
Download image from the specified `img_src` and `base_url`.
|
||||
If the image is available, it will be downloaded to the directory of `self.path`
|
||||
"""
|
||||
|
||||
logging.info("Downloading image: %s", img_src)
|
||||
|
||||
parsed = urlsplit(img_src)
|
||||
|
||||
if parsed.scheme not in ("", "http", "https"):
|
||||
# Not a valid url
|
||||
return None
|
||||
|
||||
if parsed.hostname is None:
|
||||
if img_src[0] == "/":
|
||||
img_src = f"http://{self.cmd.host}{img_src}"
|
||||
else:
|
||||
img_src = parser.absolute_url(base_url, img_src)
|
||||
img_src = parser.urljoin(base_url, img_src)
|
||||
|
||||
if parsed.hostname is None or parsed.hostname == self.cmd.host:
|
||||
port = self.cmd.port
|
||||
|
Reference in New Issue
Block a user