Parse html with regex, fix small issues
@@ -4,15 +4,17 @@ import re
 from abc import ABC, abstractmethod
 from urllib.parse import urlsplit, unquote
 
-from bs4 import BeautifulSoup, Tag
-
 from client.command import AbstractCommand, GetCommand
-from client.httpclient import HTTPClient, FORMAT
+from client.httpclient import HTTPClient
 from httplib import parser
 from httplib.exceptions import InvalidResponse
+from httplib.httpsocket import FORMAT
 from httplib.message import ClientMessage as Message
 from httplib.retriever import Retriever
 
+BASE_REGEX = re.compile(r"<\s*base.*href\s*=\s*['\"](\S*)['\"][^>]*>", re.M | re.I)
+IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)
+
 
 def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
     handler = BasicResponseHandler(client, msg, command)
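These two patterns are what the commit swaps in for BeautifulSoup: group(1) captures the bare URL between the quotes, and m.start(1)/m.end(1) give its exact character span, which the new _download_images uses later to splice local paths into the page text. A standalone sketch of what they match (illustration only, not part of the commit):

    import re

    IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)

    html = '<p><IMG class="logo" SRC="images/logo.png" alt=""></p>'
    m = IMG_REGEX.search(html)
    print(m.group(1))            # images/logo.png (tag and attribute match case-insensitively)
    print(m.start(1), m.end(1))  # character span of the URL inside html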
@@ -83,8 +85,10 @@ class BasicResponseHandler(ResponseHandler):
 
         if 300 <= self.msg.status < 400:
             # Redirect
+            self._skip_body()
             return self._do_handle_redirect()
         if 400 <= self.msg.status < 600:
+            self._skip_body()
             # Dump headers and exit with error
             if not self.cmd.sub_request:
                 print("".join(self.msg.raw), end="")
@@ -93,8 +97,6 @@ class BasicResponseHandler(ResponseHandler):
         return None
 
     def _do_handle_redirect(self):
-        self._skip_body()
-
         if self.msg.status == 304:
             print("".join(self.msg.raw), end="")
             return None
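The two _skip_body() calls added above replace the single call just removed from _do_handle_redirect, so 4xx/5xx responses now drain their body as well. Draining matters on a persistent connection: unread body bytes would otherwise be parsed as the start of the next response. A minimal sketch of the idea behind such a helper (sock and content_length are hypothetical; the project's _skip_body is not shown in this diff):

    def skip_body(sock, content_length: int) -> None:
        # Read and discard exactly the advertised body so the
        # connection can be reused for the next exchange.
        remaining = content_length
        while remaining > 0:
            chunk = sock.recv(min(4096, remaining))
            if not chunk:  # peer closed early
                break
            remaining -= len(chunk)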
@@ -203,40 +205,61 @@ class HTMLDownloadHandler(DownloadHandler):
             file.write(buffer)
         file.close()
 
-        self._download_images(tmp_path, self.path)
+        charset = parser.get_charset(self.msg.headers)
+        self._download_images(tmp_path, self.path, charset)
         os.remove(tmp_path)
         return self.path
 
-    def _download_images(self, tmp_filename, target_filename):
+    def _download_images(self, tmp_filename, target_filename, charset=FORMAT):
 
-        with open(tmp_filename, "rb") as fp:
-            soup = BeautifulSoup(fp, 'lxml')
+        try:
+            fp = open(tmp_filename, "r", encoding=charset)
+            html = fp.read()
+        except UnicodeDecodeError:
+            fp = open(tmp_filename, "r", encoding=FORMAT, errors="replace")
+            html = fp.read()
 
-        base_element = soup.find("base")
+        fp.close()
 
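One caveat in the fallback above: if the first open() succeeds but fp.read() raises UnicodeDecodeError, the file is reopened without the original handle being closed first, and an unrecognized charset name from the headers would raise an uncaught LookupError. A leak-free sketch of the same fallback, decoding from bytes instead of reopening (decode_with_fallback is a hypothetical helper; default stands in for the project's FORMAT constant):

    def decode_with_fallback(payload: bytes, declared: str, default: str = "utf-8") -> str:
        # Try the charset declared in the response headers first.
        try:
            return payload.decode(declared)
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown charset: degrade to the default encoding
            # with replacement characters instead of aborting the download.
            return payload.decode(default, errors="replace")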
-        base_url = self.cmd.uri
-        if base_element:
-            base_url = parser.urljoin(self.cmd.uri, base_element["href"])
+        base_element = BASE_REGEX.search(html)
+        base_url = self.cmd.uri
+        if base_element:
+            base_url = parser.urljoin(self.cmd.uri, base_element.group(1))
 
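parser.urljoin is presumably a thin wrapper over standard URL resolution; under that assumption, resolving a base href against the request URI behaves like urllib's urljoin:

    from urllib.parse import urljoin

    page = "http://example.com/articles/post.html"
    # A relative href is resolved against the page URL...
    print(urljoin(page, "/static/"))    # http://example.com/static/
    # ...while an absolute href replaces it outright.
    print(urljoin(page, "http://cdn.example.com/img/"))    # http://cdn.example.com/img/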
-        processed = {}
-        tag: Tag
-        for tag in soup.find_all("img"):
-            try:
-                if not tag.has_attr("src"):
+        processed = {}
+        to_replace = []
+
+        for m in IMG_REGEX.finditer(html):
+            url_start = m.start(1)
+            url_end = m.end(1)
+            target = m.group(1)
+
+            try:
+                if len(target) == 0:
                     continue
-                if tag["src"] in processed:
-                    new_url = processed.get(tag["src"])
-                else:
-                    new_url = self.__download_image(tag["src"], base_url)
-                    processed[tag["src"]] = new_url
-                if new_url:
-                    tag["src"] = os.path.basename(new_url)
-            except Exception as e:
-                logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
+                if target in processed:
+                    new_url = processed.get(target)
+                else:
+                    new_url = self.__download_image(target, base_url)
+                if not new_url:
+                    # Image failed to download
+                    continue
+                processed[target] = new_url
+
+                if new_url:
+                    local_path = os.path.basename(new_url)
+                    to_replace.append((url_start, url_end, local_path))
+
+            except Exception as e:
+                logging.error("Failed to download image: %s, skipping...", target, exc_info=e)
 
-        with open(target_filename, 'w') as file:
-            file.write(soup.prettify(formatter="minimal"))
+        to_replace.reverse()
+        for (start, end, path) in to_replace:
+            html = html[:start] + path + html[end:]
+
+        with open(target_filename, 'w', encoding=FORMAT) as file:
+            file.write(html)
 
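The reverse() before splicing is what keeps the recorded spans valid: IMG_REGEX.finditer reported offsets against the original string, so length-changing replacements must run back-to-front; working front-to-back would shift every later span. A toy illustration:

    html = "aXbYc"
    spans = [(1, 2, "12"), (3, 4, "345")]    # replace X -> 12, Y -> 345

    # Back-to-front: the text before each remaining span is untouched,
    # so (1, 2) still points at X after Y has been replaced.
    for start, end, repl in reversed(spans):
        html = html[:start] + repl + html[end:]

    print(html)    # a12b345c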
     def __download_image(self, img_src, base_url):
         """