Parse html with regex, fix small issues

2021-03-27 23:41:28 +01:00
parent bbca6f603b
commit 4473d1bec9
7 changed files with 134 additions and 80 deletions

@@ -4,15 +4,17 @@ import re
 from abc import ABC, abstractmethod
 from urllib.parse import urlsplit, unquote
-from bs4 import BeautifulSoup, Tag
 from client.command import AbstractCommand, GetCommand
-from client.httpclient import HTTPClient, FORMAT
+from client.httpclient import HTTPClient
 from httplib import parser
 from httplib.exceptions import InvalidResponse
+from httplib.httpsocket import FORMAT
 from httplib.message import ClientMessage as Message
 from httplib.retriever import Retriever
 
+BASE_REGEX = re.compile(r"<\s*base.*href\s*=\s*['\"](\S*)['\"][^>]*>", re.M | re.I)
+IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)
+
 def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
     handler = BasicResponseHandler(client, msg, command)
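
The heart of this commit is replacing the BeautifulSoup dependency with the two patterns above. A minimal sketch of how they behave, on illustrative HTML that is not from the repository: `BASE_REGEX` captures the `href` value of a `<base>` tag, while `IMG_REGEX` captures every `img` `src` URL together with its exact character offsets, which the rewritten `_download_images()` further down uses to splice in local paths.

```python
import re

# The two patterns from the diff, reproduced for illustration.
BASE_REGEX = re.compile(r"<\s*base.*href\s*=\s*['\"](\S*)['\"][^>]*>", re.M | re.I)
IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)

# Illustrative input, not taken from the repository.
html = """
<head><base href="https://example.com/docs/"></head>
<body>
  <img alt="logo" src='img/logo.png'>
  <IMG SRC="/absolute/photo.jpg" width="10">
</body>
"""

base = BASE_REGEX.search(html)
print(base.group(1))  # https://example.com/docs/

for m in IMG_REGEX.finditer(html):
    # group(1) is the URL; start(1)/end(1) are its offsets in the source.
    print(m.group(1), m.start(1), m.end(1))
```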
@@ -83,8 +85,10 @@ class BasicResponseHandler(ResponseHandler):
         if 300 <= self.msg.status < 400:
             # Redirect
+            self._skip_body()
             return self._do_handle_redirect()
         if 400 <= self.msg.status < 600:
+            self._skip_body()
             # Dump headers and exit with error
             if not self.cmd.sub_request:
                 print("".join(self.msg.raw), end="")
@@ -93,8 +97,6 @@ class BasicResponseHandler(ResponseHandler):
return None
def _do_handle_redirect(self):
self._skip_body()
if self.msg.status == 304:
print("".join(self.msg.raw), end="")
return None
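
The two hunks above hoist `_skip_body()` out of `_do_handle_redirect()` and into the status dispatch, so the 4xx and 5xx paths now drain the unread body just like the redirect path. The commit only shows the call sites; a plausible motivation is keep-alive hygiene, since leftover body bytes on a reused connection would be parsed as the start of the next response. A sketch of what such a routine could look like, with the caveat that everything here beyond the method name is an assumption:

```python
# Hypothetical sketch, not from this commit: read and discard the rest of
# the current response body so the connection is left at a message
# boundary. Assumes a plain Content-Length body and a socket at self.sock.
def _skip_body(self):
    remaining = int(self.msg.headers.get("content-length", 0))
    while remaining > 0:
        chunk = self.sock.recv(min(4096, remaining))
        if not chunk:  # peer closed early; nothing left to drain
            break
        remaining -= len(chunk)
```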
@@ -203,40 +205,61 @@ class HTMLDownloadHandler(DownloadHandler):
             file.write(buffer)
         file.close()
-        self._download_images(tmp_path, self.path)
+        charset = parser.get_charset(self.msg.headers)
+        self._download_images(tmp_path, self.path, charset)
         os.remove(tmp_path)
         return self.path
 
-    def _download_images(self, tmp_filename, target_filename):
-        with open(tmp_filename, "rb") as fp:
-            soup = BeautifulSoup(fp, 'lxml')
-        base_element = soup.find("base")
+    def _download_images(self, tmp_filename, target_filename, charset=FORMAT):
+        try:
+            fp = open(tmp_filename, "r", encoding=charset)
+            html = fp.read()
+        except UnicodeDecodeError:
+            fp = open(tmp_filename, "r", encoding=FORMAT, errors="replace")
+            html = fp.read()
+        fp.close()
+        base_element = BASE_REGEX.search(html)
         base_url = self.cmd.uri
         if base_element:
-            base_url = parser.urljoin(self.cmd.uri, base_element["href"])
+            base_url = parser.urljoin(self.cmd.uri, base_element.group(1))
         processed = {}
-        tag: Tag
-        for tag in soup.find_all("img"):
+        to_replace = []
+        for m in IMG_REGEX.finditer(html):
+            url_start = m.start(1)
+            url_end = m.end(1)
+            target = m.group(1)
             try:
-                if not tag.has_attr("src"):
+                if len(target) == 0:
                     continue
-                if tag["src"] in processed:
-                    new_url = processed.get(tag["src"])
+                if target in processed:
+                    new_url = processed.get(target)
                 else:
-                    new_url = self.__download_image(tag["src"], base_url)
-                    processed[tag["src"]] = new_url
+                    new_url = self.__download_image(target, base_url)
+                    if not new_url:
+                        # Image failed to download
+                        continue
+                    processed[target] = new_url
                 if new_url:
-                    tag["src"] = os.path.basename(new_url)
+                    local_path = os.path.basename(new_url)
+                    to_replace.append((url_start, url_end, local_path))
             except Exception as e:
-                logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
-        with open(target_filename, 'w') as file:
-            file.write(soup.prettify(formatter="minimal"))
+                logging.error("Failed to download image: %s, skipping...", target, exc_info=e)
+        to_replace.reverse()
+        for (start, end, path) in to_replace:
+            html = html[:start] + path + html[end:]
+        with open(target_filename, 'w', encoding=FORMAT) as file:
+            file.write(html)
 
     def __download_image(self, img_src, base_url):
         """