update

2021-03-22 02:41:49 +01:00
parent d25d2ef993
commit 42f1661e0a
10 changed files with 172 additions and 54 deletions
--- a/client/command.py
+++ b/client/command.py
@@ -40,6 +40,7 @@ class AbstractCommand(ABC):
        message = f"{self.command} {path} HTTP/1.1\r\n"
        message += f"Host: {host}\r\n"
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
+        message += "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0\r\n"
        encoded_msg = self._build_message(message)

        logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
--- a/client/response_handler.py
+++ b/client/response_handler.py
@@ -4,12 +4,13 @@ from abc import ABC, abstractmethod
 from typing import Dict
 from urllib.parse import urlparse

-from bs4 import BeautifulSoup
+import cssutils
+from bs4 import BeautifulSoup, Tag

 from client.httpclient import HTTPClient, FORMAT
-from httplib.retriever import Retriever
 from httplib import parser
 from httplib.exceptions import InvalidResponse
+from httplib.retriever import Retriever


 class ResponseHandler(ABC):
@@ -159,15 +160,15 @@ class HTMLDownloadHandler(DownloadHandler):
            file.write(buffer)
        file.close()

-        self.__download_images(tmp_path, self.path)
+        self._download_images(tmp_path, self.path)
        os.remove(tmp_path)
        return self.path

-    def __download_images(self, tmp_filename, target_filename):
+    def _download_images(self, tmp_filename, target_filename):

        (host, path) = ResponseHandler.parse_uri(self.url)
        with open(tmp_filename, "rb") as fp:
-            soup = BeautifulSoup(fp, 'html.parser')
+            soup = BeautifulSoup(fp, 'lxml')

            base_url = self.url
            base_element = soup.find("base")
@@ -175,13 +176,51 @@ class HTMLDownloadHandler(DownloadHandler):
            if base_element:
                base_url = base_element["href"]

+            processed = {}
+            tag: Tag
            for tag in soup.find_all("img"):
                try:
-                    tag["src"] = self.__download_image(tag["src"], host, base_url)
+                    if tag["src"] in processed:
+                        new_url = processed.get(tag["src"])
+                    else:
+                        new_url = self.__download_image(tag["src"], host, base_url)
+                        processed[tag["src"]] = new_url
+                    if new_url:
+                        tag["src"] = new_url
                except Exception as e:
                    logging.debug(e)
                    logging.error("Failed to download image: %s, skipping...", tag["src"])

+            for tag in soup.find_all("div"):
+                if not tag.has_attr("style"):
+                    continue
+                style = cssutils.parseStyle(tag["style"])
+                # Only inline styles whose background references an image through url(...) are rewritten.
+                if "background" in style and "url(" in style["background"]:
+                    el_name = "background"
+                elif "background-image" in style and "url(" in style["background-image"]:
+                    el_name = "background-image"
+                else:
+                    continue
+                el = style[el_name]
+                start = el.find("url(") + 4
+                end = el.find(")", start)
+                url = el[start:end].strip()
+                # `processed` maps an original url to the result of its first download attempt.
+                try:
+                    if url in processed:
+                        new_url = processed[url]  # reuse the cached result, not the original url
+                    else:
+                        new_url = self.__download_image(url, host, base_url)
+                        processed[url] = new_url
+                    if new_url:
+                        el = el[:start] + new_url + el[end:]
+                        style[el_name] = el
+                        tag["style"] = style.cssText
+                except Exception as e:
+                    logging.debug("Internal error", exc_info=e)
+                    logging.error("Failed to download image: %s, skipping...", url)  # div tags have no "src"
+
        with open(target_filename, 'w') as file:
            file.write(str(soup))

@@ -190,6 +229,10 @@ class HTMLDownloadHandler(DownloadHandler):

        logging.debug("Downloading image: %s", img_src)

+        if parsed.scheme not in ("", "http"):
+            # Not a valid url
+            return None
+
        if len(parsed.netloc) == 0 and parsed.path != "/":
            # relative url, append base_url
            img_src = os.path.join(os.path.dirname(base_url), parsed.path)