small fixes

This commit is contained in:
2021-03-22 04:12:13 +01:00
parent 42f1661e0a
commit 9ba7a030a7
3 changed files with 37 additions and 14 deletions

View File

@@ -1,8 +1,9 @@
import logging
import os
import re
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse
from urllib.parse import urlparse, unquote
import cssutils
from bs4 import BeautifulSoup, Tag
@@ -107,6 +108,11 @@ class DownloadHandler(ResponseHandler, ABC):
elif parsed.path[-1] != "/":
filename = parsed.path[index:]
while "%" in filename:
filename = unquote(filename)
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
result = os.path.basename(filename).strip()
if any(letter.isalnum() for letter in result):
return result
@@ -152,7 +158,7 @@ class HTMLDownloadHandler(DownloadHandler):
def handle(self) -> str:
(dir, file) = os.path.split(self.path)
tmp_filename = ".{file}.tmp".format(file=file)
tmp_filename = f".{file}.tmp"
tmp_path = os.path.join(dir, tmp_filename)
file = open(tmp_path, "wb")
@@ -180,16 +186,22 @@ class HTMLDownloadHandler(DownloadHandler):
tag: Tag
for tag in soup.find_all("img"):
try:
if tag["src"] in processed:
new_url = processed.get(tag["src"])
if tag.has_attr("src"):
el_name = "src"
elif tag.has_attr("data-src"):
el_name = "data-src"
else:
new_url = self.__download_image(tag["src"], host, base_url)
processed[tag["src"]] = new_url
continue
if tag[el_name] in processed:
new_url = processed.get(tag[el_name])
else:
new_url = self.__download_image(tag[el_name], host, base_url)
processed[tag[el_name]] = new_url
if new_url:
tag["src"] = new_url
tag[el_name] = new_url
except Exception as e:
logging.debug(e)
logging.error("Failed to download image: %s, skipping...", tag["src"])
logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
for tag in soup.find_all("div"):
if not tag.has_attr("style"):
@@ -229,7 +241,7 @@ class HTMLDownloadHandler(DownloadHandler):
logging.debug("Downloading image: %s", img_src)
if parsed.scheme not in ("", "http"):
if parsed.scheme not in ("", "http", "https"):
# Not a valid url
return None
@@ -248,9 +260,9 @@ class HTMLDownloadHandler(DownloadHandler):
same_host = False
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
message = f"GET {img_path} HTTP/1.1\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += "Host: {host}\r\n\r\n".format(host=host)
message += f"Host: {img_host}\r\n\r\n"
message = message.encode(FORMAT)
if same_host: