Parse html with regex, fix small issues

2021-03-27 23:41:28 +01:00
parent bbca6f603b
commit 4473d1bec9
7 changed files with 134 additions and 80 deletions

@@ -4,15 +4,17 @@ import re
 from abc import ABC, abstractmethod
 from urllib.parse import urlsplit, unquote
-from bs4 import BeautifulSoup, Tag
 from client.command import AbstractCommand, GetCommand
-from client.httpclient import HTTPClient, FORMAT
+from client.httpclient import HTTPClient
 from httplib import parser
 from httplib.exceptions import InvalidResponse
+from httplib.httpsocket import FORMAT
 from httplib.message import ClientMessage as Message
 from httplib.retriever import Retriever
 
+BASE_REGEX = re.compile(r"<\s*base.*href\s*=\s*['\"](\S*)['\"][^>]*>", re.M | re.I)
+IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)
+
 def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
     handler = BasicResponseHandler(client, msg, command)
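
The heart of this commit is replacing the BeautifulSoup dependency with the two patterns above. A minimal sketch of how they behave, on illustrative HTML that is not from the repository: `BASE_REGEX` captures the `href` value of a `<base>` tag, while `IMG_REGEX` captures every `img` `src` URL together with its exact character offsets, which the rewritten `_download_images()` further down uses to splice in local paths.

```python
import re

# The two patterns from the diff, reproduced for illustration.
BASE_REGEX = re.compile(r"<\s*base.*href\s*=\s*['\"](\S*)['\"][^>]*>", re.M | re.I)
IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)

# Illustrative input, not taken from the repository.
html = """
<head><base href="https://example.com/docs/"></head>
<body>
  <img alt="logo" src='img/logo.png'>
  <IMG SRC="/absolute/photo.jpg" width="10">
</body>
"""

base = BASE_REGEX.search(html)
print(base.group(1))  # https://example.com/docs/

for m in IMG_REGEX.finditer(html):
    # group(1) is the URL; start(1)/end(1) are its offsets in the source.
    print(m.group(1), m.start(1), m.end(1))
```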
@@ -83,8 +85,10 @@ class BasicResponseHandler(ResponseHandler):
         if 300 <= self.msg.status < 400:
             # Redirect
+            self._skip_body()
             return self._do_handle_redirect()
         if 400 <= self.msg.status < 600:
+            self._skip_body()
             # Dump headers and exit with error
             if not self.cmd.sub_request:
                 print("".join(self.msg.raw), end="")
@@ -93,8 +97,6 @@ class BasicResponseHandler(ResponseHandler):
return None
def _do_handle_redirect(self):
self._skip_body()
if self.msg.status == 304:
print("".join(self.msg.raw), end="")
return None
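
The two hunks above hoist `_skip_body()` out of `_do_handle_redirect()` and into the status dispatch, so the 4xx and 5xx paths now drain the unread body just like the redirect path. The commit only shows the call sites; a plausible motivation is keep-alive hygiene, since leftover body bytes on a reused connection would be parsed as the start of the next response. A sketch of what such a routine could look like, with the caveat that everything here beyond the method name is an assumption:

```python
# Hypothetical sketch, not from this commit: read and discard the rest of
# the current response body so the connection is left at a message
# boundary. Assumes a plain Content-Length body and a socket at self.sock.
def _skip_body(self):
    remaining = int(self.msg.headers.get("content-length", 0))
    while remaining > 0:
        chunk = self.sock.recv(min(4096, remaining))
        if not chunk:  # peer closed early; nothing left to drain
            break
        remaining -= len(chunk)
```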
@@ -203,40 +205,61 @@ class HTMLDownloadHandler(DownloadHandler):
             file.write(buffer)
         file.close()
-        self._download_images(tmp_path, self.path)
+        charset = parser.get_charset(self.msg.headers)
+        self._download_images(tmp_path, self.path, charset)
         os.remove(tmp_path)
         return self.path
 
-    def _download_images(self, tmp_filename, target_filename):
-        with open(tmp_filename, "rb") as fp:
-            soup = BeautifulSoup(fp, 'lxml')
-        base_element = soup.find("base")
+    def _download_images(self, tmp_filename, target_filename, charset=FORMAT):
+        try:
+            fp = open(tmp_filename, "r", encoding=charset)
+            html = fp.read()
+        except UnicodeDecodeError:
+            fp = open(tmp_filename, "r", encoding=FORMAT, errors="replace")
+            html = fp.read()
+        fp.close()
+        base_element = BASE_REGEX.search(html)
         base_url = self.cmd.uri
         if base_element:
-            base_url = parser.urljoin(self.cmd.uri, base_element["href"])
+            base_url = parser.urljoin(self.cmd.uri, base_element.group(1))
         processed = {}
-        tag: Tag
-        for tag in soup.find_all("img"):
+        to_replace = []
+        for m in IMG_REGEX.finditer(html):
+            url_start = m.start(1)
+            url_end = m.end(1)
+            target = m.group(1)
             try:
-                if not tag.has_attr("src"):
+                if len(target) == 0:
                     continue
-                if tag["src"] in processed:
-                    new_url = processed.get(tag["src"])
+                if target in processed:
+                    new_url = processed.get(target)
                 else:
-                    new_url = self.__download_image(tag["src"], base_url)
-                    processed[tag["src"]] = new_url
+                    new_url = self.__download_image(target, base_url)
+                    if not new_url:
+                        # Image failed to download
+                        continue
+                    processed[target] = new_url
                 if new_url:
-                    tag["src"] = os.path.basename(new_url)
+                    local_path = os.path.basename(new_url)
+                    to_replace.append((url_start, url_end, local_path))
             except Exception as e:
-                logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
-        with open(target_filename, 'w') as file:
-            file.write(soup.prettify(formatter="minimal"))
+                logging.error("Failed to download image: %s, skipping...", target, exc_info=e)
+        to_replace.reverse()
+        for (start, end, path) in to_replace:
+            html = html[:start] + path + html[end:]
+        with open(target_filename, 'w', encoding=FORMAT) as file:
+            file.write(html)
 
     def __download_image(self, img_src, base_url):
         """