Parse html with regex, fix small issues

This commit is contained in:
2021-03-27 23:41:28 +01:00
parent bbca6f603b
commit 4473d1bec9
7 changed files with 134 additions and 80 deletions

View File

@@ -3,39 +3,40 @@ from abc import ABC, abstractmethod
from typing import Dict, Tuple from typing import Dict, Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
from client.httpclient import FORMAT, HTTPClient from client.httpclient import HTTPClient
from httplib import parser from httplib import parser
from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding
from httplib.httpsocket import FORMAT
from httplib.message import ClientMessage as Message from httplib.message import ClientMessage as Message
from httplib.retriever import PreambleRetriever from httplib.retriever import PreambleRetriever
sockets: Dict[str, HTTPClient] = {} sockets: Dict[str, HTTPClient] = {}
def create(command: str, url: str, port): def create(method: str, url: str, port):
""" """
Create a corresponding Command instance of the specified HTTP `command` with the specified `url` and `port`. Create a corresponding Command instance of the specified HTTP `method` with the specified `url` and `port`.
@param command: The command type to create @param method: The command type to create
@param url: The url for the command @param url: The url for the command
@param port: The port for the command @param port: The port for the command
""" """
uri = parser.get_uri(url) uri = parser.get_uri(url)
if command == "GET": if method == "GET":
return GetCommand(uri, port) return GetCommand(uri, port)
elif command == "HEAD": elif method == "HEAD":
return HeadCommand(uri, port) return HeadCommand(uri, port)
elif command == "POST": elif method == "POST":
return PostCommand(uri, port) return PostCommand(uri, port)
elif command == "PUT": elif method == "PUT":
return PutCommand(uri, port) return PutCommand(uri, port)
else: else:
raise ValueError() raise ValueError("Unknown HTTP method")
class AbstractCommand(ABC): class AbstractCommand(ABC):
""" """
A class representing the command for sending an HTTP command. A class representing the command for sending an HTTP request.
""" """
uri: str uri: str
host: str host: str
@@ -51,10 +52,15 @@ class AbstractCommand(ABC):
@property @property
@abstractmethod @abstractmethod
def command(self): def method(self):
pass pass
def execute(self, sub_request=False): def execute(self, sub_request=False):
"""
Creates and sends the HTTP message for this Command.
@param sub_request: Whether this execution is a sub-request triggered by a prior command.
"""
self.sub_request = sub_request self.sub_request = sub_request
(host, path) = self.parse_uri() (host, path) = self.parse_uri()
@@ -69,9 +75,10 @@ class AbstractCommand(ABC):
client.conn.connect((host, self.port)) client.conn.connect((host, self.port))
sockets[host] = client sockets[host] = client
message = f"{self.command} {path} HTTP/1.1\r\n" message = f"{self.method} {path} HTTP/1.1\r\n"
message += f"Host: {host}:{self.port}\r\n" message += f"Host: {host}:{self.port}\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n" message += "Accept: */*\r\n"
message += "Accept-Encoding: identity\r\n"
encoded_msg = self._build_message(message) encoded_msg = self._build_message(message)
logging.debug("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT)) logging.debug("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
@@ -81,8 +88,7 @@ class AbstractCommand(ABC):
logging.info("HTTP request sent, awaiting response...") logging.info("HTTP request sent, awaiting response...")
try: try:
retriever = PreambleRetriever(client) self._await_response(client)
self._await_response(client, retriever)
except InvalidResponse as e: except InvalidResponse as e:
logging.debug("Internal error: Response could not be parsed", exc_info=e) logging.debug("Internal error: Response could not be parsed", exc_info=e)
return return
@@ -95,7 +101,12 @@ class AbstractCommand(ABC):
if not sub_request: if not sub_request:
client.close() client.close()
def _await_response(self, client, retriever): def _await_response(self, client):
"""
Simple response method.
Receives the response and prints to stdout.
"""
while True: while True:
line = client.read_line() line = client.read_line()
print(line, end="") print(line, end="")
@@ -106,11 +117,15 @@ class AbstractCommand(ABC):
return (message + "\r\n").encode(FORMAT) return (message + "\r\n").encode(FORMAT)
def parse_uri(self): def parse_uri(self):
"""
Parses the URI and returns the hostname and path.
@return: A tuple of the hostname and path.
"""
parsed = urlparse(self.uri) parsed = urlparse(self.uri)
# If there is no netloc, the url is invalid, so prepend `//` and try again # If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "": if parsed.netloc == "":
parsed = urlparse("//" + self.uri) parsed = urlparse("http://" + self.uri)
host = parsed.netloc host = parsed.netloc
path = parsed.path path = parsed.path
@@ -126,11 +141,11 @@ class AbstractCommand(ABC):
class AbstractWithBodyCommand(AbstractCommand, ABC): class AbstractWithBodyCommand(AbstractCommand, ABC):
""" """
The building block for creating an HTTP message for an HTTP command with a body. The building block for creating an HTTP message for an HTTP method with a body (POST and PUT).
""" """
def _build_message(self, message: str) -> bytes: def _build_message(self, message: str) -> bytes:
body = input(f"Enter {self.command} data: ").encode(FORMAT) body = input(f"Enter {self.method} data: ").encode(FORMAT)
print() print()
message += "Content-Type: text/plain\r\n" message += "Content-Type: text/plain\r\n"
@@ -145,29 +160,36 @@ class AbstractWithBodyCommand(AbstractCommand, ABC):
class HeadCommand(AbstractCommand): class HeadCommand(AbstractCommand):
""" """
A Command for sending a `HEAD` message. A Command for sending a `HEAD` request.
""" """
@property @property
def command(self): def method(self):
return "HEAD" return "HEAD"
class GetCommand(AbstractCommand): class GetCommand(AbstractCommand):
""" """
A Command for sending a `GET` message. A Command for sending a `GET` request.
""" """
dir: str
def __init__(self, uri: str, port, dir=None): def __init__(self, uri: str, port, directory=None):
super().__init__(uri, port) super().__init__(uri, port)
self.dir = dir self.dir = directory
self.filename = None self.filename = None
@property @property
def command(self): def method(self):
return "GET" return "GET"
def _get_preamble(self, retriever): def _get_preamble(self, client):
"""
Returns the preamble (start-line and headers) of the response of this command.
@param client: the client object to retrieve from
@return: A Message object containing the HTTP-version, status code, status message, headers and buffer
"""
retriever = PreambleRetriever(client)
lines = retriever.retrieve() lines = retriever.retrieve()
(version, status, msg) = parser.parse_status_line(next(lines)) (version, status, msg) = parser.parse_status_line(next(lines))
headers = parser.parse_headers(lines) headers = parser.parse_headers(lines)
@@ -177,8 +199,11 @@ class GetCommand(AbstractCommand):
return Message(version, status, msg, headers, buffer) return Message(version, status, msg, headers, buffer)
def _await_response(self, client, retriever): def _await_response(self, client):
msg = self._get_preamble(retriever) """
Handles the response of this command.
"""
msg = self._get_preamble(client)
from client import response_handler from client import response_handler
self.filename = response_handler.handle(client, msg, self, self.dir) self.filename = response_handler.handle(client, msg, self, self.dir)
@@ -186,19 +211,19 @@ class GetCommand(AbstractCommand):
class PostCommand(AbstractWithBodyCommand): class PostCommand(AbstractWithBodyCommand):
""" """
A command for sending a `POST` command. A command for sending a `POST` request.
""" """
@property @property
def command(self): def method(self):
return "POST" return "POST"
class PutCommand(AbstractWithBodyCommand): class PutCommand(AbstractWithBodyCommand):
""" """
A command for sending a `PUT` command. A command for sending a `PUT` request.
""" """
@property @property
def command(self): def method(self):
return "PUT" return "PUT"

View File

@@ -2,11 +2,6 @@ import socket
from httplib.httpsocket import HTTPSocket, InvalidResponse from httplib.httpsocket import HTTPSocket, InvalidResponse
BUFSIZE = 4096
TIMEOUT = 3
FORMAT = "UTF-8"
MAXLINE = 4096
class HTTPClient(HTTPSocket): class HTTPClient(HTTPSocket):
host: str host: str

View File

@@ -4,15 +4,17 @@ import re
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from urllib.parse import urlsplit, unquote from urllib.parse import urlsplit, unquote
from bs4 import BeautifulSoup, Tag
from client.command import AbstractCommand, GetCommand from client.command import AbstractCommand, GetCommand
from client.httpclient import HTTPClient, FORMAT from client.httpclient import HTTPClient
from httplib import parser from httplib import parser
from httplib.exceptions import InvalidResponse from httplib.exceptions import InvalidResponse
from httplib.httpsocket import FORMAT
from httplib.message import ClientMessage as Message from httplib.message import ClientMessage as Message
from httplib.retriever import Retriever from httplib.retriever import Retriever
BASE_REGEX = re.compile(r"<\s*base.*href\s*=\s*['\"](\S*)['\"][^>]*>", re.M | re.I)
IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None): def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
handler = BasicResponseHandler(client, msg, command) handler = BasicResponseHandler(client, msg, command)
@@ -83,8 +85,10 @@ class BasicResponseHandler(ResponseHandler):
if 300 <= self.msg.status < 400: if 300 <= self.msg.status < 400:
# Redirect # Redirect
self._skip_body()
return self._do_handle_redirect() return self._do_handle_redirect()
if 400 <= self.msg.status < 600: if 400 <= self.msg.status < 600:
self._skip_body()
# Dump headers and exit with error # Dump headers and exit with error
if not self.cmd.sub_request: if not self.cmd.sub_request:
print("".join(self.msg.raw), end="") print("".join(self.msg.raw), end="")
@@ -93,8 +97,6 @@ class BasicResponseHandler(ResponseHandler):
return None return None
def _do_handle_redirect(self): def _do_handle_redirect(self):
self._skip_body()
if self.msg.status == 304: if self.msg.status == 304:
print("".join(self.msg.raw), end="") print("".join(self.msg.raw), end="")
return None return None
@@ -203,40 +205,61 @@ class HTMLDownloadHandler(DownloadHandler):
file.write(buffer) file.write(buffer)
file.close() file.close()
self._download_images(tmp_path, self.path) charset = parser.get_charset(self.msg.headers)
self._download_images(tmp_path, self.path, charset)
os.remove(tmp_path) os.remove(tmp_path)
return self.path return self.path
def _download_images(self, tmp_filename, target_filename): def _download_images(self, tmp_filename, target_filename, charset=FORMAT):
with open(tmp_filename, "rb") as fp: try:
soup = BeautifulSoup(fp, 'lxml') fp = open(tmp_filename, "r", encoding=charset)
html = fp.read()
except UnicodeDecodeError:
fp = open(tmp_filename, "r", encoding=FORMAT, errors="replace")
html = fp.read()
base_element = soup.find("base") fp.close()
base_element = BASE_REGEX.search(html)
base_url = self.cmd.uri base_url = self.cmd.uri
if base_element: if base_element:
base_url = parser.urljoin(self.cmd.uri, base_element["href"]) base_url = parser.urljoin(self.cmd.uri, base_element.group(1))
processed = {} processed = {}
tag: Tag to_replace = []
for tag in soup.find_all("img"):
for m in IMG_REGEX.finditer(html):
url_start = m.start(1)
url_end = m.end(1)
target = m.group(1)
try: try:
if not tag.has_attr("src"): if len(target) == 0:
continue
if target in processed:
new_url = processed.get(target)
else:
new_url = self.__download_image(target, base_url)
if not new_url:
# Image failed to download
continue continue
if tag["src"] in processed: processed[target] = new_url
new_url = processed.get(tag["src"])
else:
new_url = self.__download_image(tag["src"], base_url)
processed[tag["src"]] = new_url
if new_url:
tag["src"] = os.path.basename(new_url)
except Exception as e:
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
with open(target_filename, 'w') as file: if new_url:
file.write(soup.prettify(formatter="minimal")) local_path = os.path.basename(new_url)
to_replace.append((url_start, url_end, local_path))
except Exception as e:
logging.error("Failed to download image: %s, skipping...", target, exc_info=e)
to_replace.reverse()
for (start, end, path) in to_replace:
html = html[:start] + path + html[end:]
with open(target_filename, 'w', encoding=FORMAT) as file:
file.write(html)
def __download_image(self, img_src, base_url): def __download_image(self, img_src, base_url):
""" """

View File

@@ -1,9 +1,11 @@
import logging import logging
import re import re
import urllib import urllib
from typing import Dict
from urllib.parse import urlparse, urlsplit from urllib.parse import urlparse, urlsplit
from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest, InvalidRequestLine from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest, InvalidRequestLine
from httplib.httpsocket import FORMAT
def _is_valid_http_version(http_version: str): def _is_valid_http_version(http_version: str):
@@ -164,7 +166,7 @@ def get_uri(url: str):
parsed = urlsplit(url) parsed = urlsplit(url)
result = f"http://{parsed.netloc}{parsed.path}" result = f"http://{parsed.netloc}{parsed.path}"
if parsed.query != '': if parsed.query != "":
result = f"{result}?{parsed.query}" result = f"{result}?{parsed.query}"
return result return result
@@ -175,3 +177,13 @@ def urljoin(base, url):
Join a base url and a URL to form an absolute url. Join a base url and a URL to form an absolute url.
""" """
return urllib.parse.urljoin(base, url) return urllib.parse.urljoin(base, url)
def get_charset(headers: Dict[str, str]):
    """
    Determine the character set announced by the `content-type` header.

    @param headers: The message headers; the `content-type` key is looked up exactly as spelled here.
    @return: The charset named in the header, or `FORMAT` when the header is missing,
        has no `charset` parameter, or the parameter has no usable value.
    """
    content_type = headers.get("content-type", "")

    # The parameter value may be a bare token or a quoted string (`charset="utf-8"`),
    # and charset names can contain `_`, `.`, `:` and `+` besides letters, digits and `-`.
    # Requiring at least one character prevents returning an empty (invalid) encoding name.
    match = re.search(r"charset\s*=\s*[\"']?([a-z0-9_.:+\-]+)", content_type, re.I)
    if match:
        return match.group(1)

    return FORMAT

View File

@@ -1,3 +1 @@
beautifulsoup4~=4.9.3
lxml~=4.6.2 lxml~=4.6.2
cssutils~=2.2.0

View File

@@ -6,12 +6,12 @@ from datetime import datetime
from time import mktime from time import mktime
from wsgiref.handlers import format_date_time from wsgiref.handlers import format_date_time
from client.httpclient import FORMAT
from httplib import parser from httplib import parser
from httplib.exceptions import NotFound, Forbidden, NotModified from httplib.exceptions import NotFound, Forbidden, NotModified
from httplib.httpsocket import FORMAT
from httplib.message import ServerMessage as Message from httplib.message import ServerMessage as Message
root = os.path.join(os.path.dirname(sys.argv[0]), "public") CONTENT_ROOT = os.path.join(os.path.dirname(sys.argv[0]), "public")
status_message = { status_message = {
200: "OK", 200: "OK",
@@ -26,6 +26,12 @@ status_message = {
def create(message: Message): def create(message: Message):
"""
Creates a Command based on the specified message
@param message: the message to create the Command with.
@return: An instance of `AbstractCommand`
"""
if message.method == "GET": if message.method == "GET":
return GetCommand(message) return GetCommand(message)
elif message.method == "HEAD": elif message.method == "HEAD":
@@ -102,9 +108,9 @@ class AbstractCommand(ABC):
norm_path = os.path.normpath(self.msg.target.path) norm_path = os.path.normpath(self.msg.target.path)
if norm_path == "/": if norm_path == "/":
path = root + "/index.html" path = CONTENT_ROOT + "/index.html"
else: else:
path = root + norm_path path = CONTENT_ROOT + norm_path
if check and not os.path.exists(path): if check and not os.path.exists(path):
raise NotFound(path) raise NotFound(path)
@@ -169,7 +175,7 @@ class AbstractModifyCommand(AbstractCommand, ABC):
else: else:
status = 201 status = 201
location = parser.urljoin("/", os.path.relpath(path, root)) location = parser.urljoin("/", os.path.relpath(path, CONTENT_ROOT))
return self._build_message(status, "text/plain", b"", {"Location": location}) return self._build_message(status, "text/plain", b"", {"Location": location})

View File

@@ -3,11 +3,6 @@ import socket
from httplib.exceptions import BadRequest from httplib.exceptions import BadRequest
from httplib.httpsocket import HTTPSocket from httplib.httpsocket import HTTPSocket
BUFSIZE = 4096
TIMEOUT = 3
FORMAT = "UTF-8"
MAXLINE = 4096
class ServerSocket(HTTPSocket): class ServerSocket(HTTPSocket):