This commit is contained in:
2021-03-22 02:41:49 +01:00
parent d25d2ef993
commit 42f1661e0a
10 changed files with 172 additions and 54 deletions

View File

@@ -40,6 +40,7 @@ class AbstractCommand(ABC):
message = f"{self.command} {path} HTTP/1.1\r\n"
message += f"Host: {host}\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0\r\n"
encoded_msg = self._build_message(message)
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))

View File

@@ -4,12 +4,13 @@ from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import cssutils
from bs4 import BeautifulSoup, Tag
from client.httpclient import HTTPClient, FORMAT
from httplib.retriever import Retriever
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.retriever import Retriever
class ResponseHandler(ABC):
@@ -159,15 +160,15 @@ class HTMLDownloadHandler(DownloadHandler):
file.write(buffer)
file.close()
self.__download_images(tmp_path, self.path)
self._download_images(tmp_path, self.path)
os.remove(tmp_path)
return self.path
def __download_images(self, tmp_filename, target_filename):
def _download_images(self, tmp_filename, target_filename):
    """Download the images referenced by an HTML file and rewrite their URLs.

    Parses ``tmp_filename``, passes every ``<img src>`` and every ``url(...)``
    found in a ``<div>``'s inline ``background`` / ``background-image`` style
    to ``self.__download_image``, replaces the reference with the value that
    call returns (when truthy) and writes the resulting document to
    ``target_filename``. A failure on a single image is logged and skipped so
    that the rest of the page is still processed.

    :param tmp_filename: path of the HTML file to read
    :param target_filename: path the rewritten HTML is written to
    """
    host, _ = ResponseHandler.parse_uri(self.url)

    with open(tmp_filename, "rb") as fp:
        soup = BeautifulSoup(fp, 'lxml')

    # A <base href="..."> element overrides the document URL when resolving
    # relative references; a <base> without href is ignored.
    base_url = self.url
    base_element = soup.find("base")
    if base_element is not None and base_element.has_attr("href"):
        base_url = base_element["href"]

    # Maps an original image URL to the result of its download, so an image
    # that is referenced several times is fetched only once. Falsy results
    # are cached too, which avoids retrying unsupported URLs.
    processed = {}

    def localize(url):
        """Return the (cached) download result for ``url``."""
        if url not in processed:
            processed[url] = self.__download_image(url, host, base_url)
        return processed[url]

    tag: Tag
    for tag in soup.find_all("img"):
        # Use .get(): an <img> without src must not raise, neither here nor
        # in the error handler below.
        src = tag.get("src")
        if not src:
            continue
        try:
            new_url = localize(src)
            if new_url:
                tag["src"] = new_url
        except Exception as e:
            logging.debug(e)
            logging.error("Failed to download image: %s, skipping...", src)

    for tag in soup.find_all("div"):
        if not tag.has_attr("style"):
            continue

        style = cssutils.parseStyle(tag["style"])
        if "background" in style and "url(" in style["background"]:
            el_name = "background"
        elif "background-image" in style and "url(" in style["background-image"]:
            el_name = "background-image"
        else:
            continue

        el = style[el_name]
        start = el.find("url(") + 4
        end = el.find(")", start)
        if end == -1:
            # Unterminated url( ... : nothing reliable to extract.
            continue
        # The URL may be written quoted, e.g. url("img.png").
        url = el[start:end].strip().strip("'\"")
        if not url:
            continue

        try:
            new_url = localize(url)
            if new_url:
                style[el_name] = el[:start] + new_url + el[end:]
                tag["style"] = style.cssText
        except Exception as e:
            logging.debug("Internal error", exc_info=e)
            # Log the URL itself: a <div> has no "src" attribute to report.
            logging.error("Failed to download image: %s, skipping...", url)

    with open(target_filename, 'w') as file:
        file.write(str(soup))
@@ -190,6 +229,10 @@ class HTMLDownloadHandler(DownloadHandler):
logging.debug("Downloading image: %s", img_src)
if parsed.scheme not in ("", "http"):
# Not a valid url
return None
if len(parsed.netloc) == 0 and parsed.path != "/":
# relative url, append base_url
img_src = os.path.join(os.path.dirname(base_url), parsed.path)