update
This commit is contained in:
@@ -40,6 +40,7 @@ class AbstractCommand(ABC):
|
||||
message = f"{self.command} {path} HTTP/1.1\r\n"
|
||||
message += f"Host: {host}\r\n"
|
||||
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
||||
message += "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0\r\n"
|
||||
encoded_msg = self._build_message(message)
|
||||
|
||||
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
|
||||
|
@@ -4,12 +4,13 @@ from abc import ABC, abstractmethod
|
||||
from typing import Dict
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import cssutils
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from client.httpclient import HTTPClient, FORMAT
|
||||
from httplib.retriever import Retriever
|
||||
from httplib import parser
|
||||
from httplib.exceptions import InvalidResponse
|
||||
from httplib.retriever import Retriever
|
||||
|
||||
|
||||
class ResponseHandler(ABC):
|
||||
@@ -159,15 +160,15 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
file.write(buffer)
|
||||
file.close()
|
||||
|
||||
self.__download_images(tmp_path, self.path)
|
||||
self._download_images(tmp_path, self.path)
|
||||
os.remove(tmp_path)
|
||||
return self.path
|
||||
|
||||
def __download_images(self, tmp_filename, target_filename):
|
||||
def _download_images(self, tmp_filename, target_filename):
    """
    Download the images referenced by an HTML file and write a copy of the
    document that points at the downloaded copies.

    Two kinds of references are handled: the `src` attribute of `<img>` tags
    and a `url(...)` inside the inline `background` / `background-image`
    style of `<div>` tags. Every distinct URL is downloaded at most once;
    the result is remembered in a per-call cache and reused for duplicates.
    A reference whose download fails, or for which the downloader returns a
    falsy value, is left untouched.

    :param tmp_filename: path of the HTML file to read (opened in binary mode)
    :param target_filename: path the rewritten HTML is written to
    """
    # Only the host is needed here; the path component is not used.
    host, _ = ResponseHandler.parse_uri(self.url)

    # BeautifulSoup consumes the whole stream while constructing the tree,
    # so the file handle is not needed afterwards.
    with open(tmp_filename, "rb") as fp:
        soup = BeautifulSoup(fp, 'lxml')

    # A <base href="..."> element overrides the document URL when resolving
    # relative references. A <base> without href (e.g. only `target`) is
    # ignored instead of raising KeyError.
    base_url = self.url
    base_element = soup.find("base")
    if base_element is not None and base_element.get("href"):
        base_url = base_element["href"]

    # Maps original URL -> value returned by the downloader (may be None).
    processed = {}

    tag: Tag
    for tag in soup.find_all("img"):
        src = tag.get("src")
        if not src:
            # <img> without a usable src: nothing to download.
            continue
        try:
            if src not in processed:
                processed[src] = self.__download_image(src, host, base_url)
            new_url = processed[src]
            if new_url:
                tag["src"] = new_url
        except Exception as e:
            # Best effort: one broken image must not abort the whole page.
            logging.debug("Internal error", exc_info=e)
            logging.error("Failed to download image: %s, skipping...", src)

    for tag in soup.find_all("div"):
        if not tag.has_attr("style"):
            continue
        style = cssutils.parseStyle(tag["style"])

        if "background" in style and "url(" in style["background"]:
            el_name = "background"
        elif "background-image" in style and "url(" in style["background-image"]:
            el_name = "background-image"
        else:
            continue

        el = style[el_name]
        start = el.find("url(") + 4
        end = el.find(")", start)
        if end == -1:
            # Unterminated url( ... : the slice below would be meaningless.
            continue
        # The value may be quoted (url("x.png")); the quotes are not part of
        # the URL. The whole span between the parentheses is replaced below,
        # so the quotes disappear from the rewritten declaration.
        url = el[start:end].strip().strip("\"'")
        if not url:
            continue

        try:
            if url not in processed:
                processed[url] = self.__download_image(url, host, base_url)
            # Use the cached *result*, not the original URL, so repeated
            # backgrounds are rewritten as well.
            new_url = processed[url]
            if new_url:
                style[el_name] = el[:start] + new_url + el[end:]
                tag["style"] = style.cssText
        except Exception as e:
            logging.debug("Internal error", exc_info=e)
            # A <div> has no "src" attribute; log the URL that was attempted.
            logging.error("Failed to download image: %s, skipping...", url)

    with open(target_filename, 'w') as file:
        file.write(str(soup))
|
||||
|
||||
@@ -190,6 +229,10 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
|
||||
logging.debug("Downloading image: %s", img_src)
|
||||
|
||||
if parsed.scheme not in ("", "http"):
|
||||
# Not a valid url
|
||||
return None
|
||||
|
||||
if len(parsed.netloc) == 0 and parsed.path != "/":
|
||||
# relative url, append base_url
|
||||
img_src = os.path.join(os.path.dirname(base_url), parsed.path)
|
||||
|
Reference in New Issue
Block a user