Parse html with regex, fix small issues

This commit is contained in:
2021-03-27 23:41:28 +01:00
parent bbca6f603b
commit 4473d1bec9
7 changed files with 134 additions and 80 deletions

View File

@@ -3,39 +3,40 @@ from abc import ABC, abstractmethod
from typing import Dict, Tuple
from urllib.parse import urlparse
from client.httpclient import FORMAT, HTTPClient
from client.httpclient import HTTPClient
from httplib import parser
from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding
from httplib.httpsocket import FORMAT
from httplib.message import ClientMessage as Message
from httplib.retriever import PreambleRetriever
sockets: Dict[str, HTTPClient] = {}
def create(method: str, url: str, port):
    """
    Build the Command object that matches the given HTTP `method`.

    The `url` is first normalised through `parser.get_uri`; the resulting URI and the
    `port` are handed to the constructor of the matching Command class.
    @param method: The HTTP method name (`GET`, `HEAD`, `POST` or `PUT`)
    @param url: The url for the command
    @param port: The port for the command
    @return: A `GetCommand`, `HeadCommand`, `PostCommand` or `PutCommand` instance
    @raise ValueError: if `method` is none of the supported method names
    """
    uri = parser.get_uri(url)

    # Pairs of (method name, Command class), checked in order with plain equality.
    supported = (
        ("GET", GetCommand),
        ("HEAD", HeadCommand),
        ("POST", PostCommand),
        ("PUT", PutCommand),
    )
    for name, command_class in supported:
        if method == name:
            return command_class(uri, port)

    raise ValueError("Unknown HTTP method")
class AbstractCommand(ABC):
"""
A class representing the command for sending an HTTP command.
A class representing the command for sending an HTTP request.
"""
uri: str
host: str
@@ -51,10 +52,15 @@ class AbstractCommand(ABC):
@property
@abstractmethod
def command(self):
def method(self):
pass
def execute(self, sub_request=False):
"""
Creates and sends the HTTP message for this Command.
@param sub_request: If this execution is in function of a prior command.
"""
self.sub_request = sub_request
(host, path) = self.parse_uri()
@@ -69,9 +75,10 @@ class AbstractCommand(ABC):
client.conn.connect((host, self.port))
sockets[host] = client
message = f"{self.command} {path} HTTP/1.1\r\n"
message = f"{self.method} {path} HTTP/1.1\r\n"
message += f"Host: {host}:{self.port}\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += "Accept: */*\r\n"
message += "Accept-Encoding: identity\r\n"
encoded_msg = self._build_message(message)
logging.debug("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
@@ -81,8 +88,7 @@ class AbstractCommand(ABC):
logging.info("HTTP request sent, awaiting response...")
try:
retriever = PreambleRetriever(client)
self._await_response(client, retriever)
self._await_response(client)
except InvalidResponse as e:
logging.debug("Internal error: Response could not be parsed", exc_info=e)
return
@@ -95,7 +101,12 @@ class AbstractCommand(ABC):
if not sub_request:
client.close()
def _await_response(self, client, retriever):
def _await_response(self, client):
"""
Simple response method.
Receives the response and prints to stdout.
"""
while True:
line = client.read_line()
print(line, end="")
@@ -106,11 +117,15 @@ class AbstractCommand(ABC):
return (message + "\r\n").encode(FORMAT)
def parse_uri(self):
"""
Parses the URI and returns the hostname and path.
@return: A tuple of the hostname and path.
"""
parsed = urlparse(self.uri)
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + self.uri)
parsed = urlparse("http://" + self.uri)
host = parsed.netloc
path = parsed.path
@@ -126,11 +141,11 @@ class AbstractCommand(ABC):
class AbstractWithBodyCommand(AbstractCommand, ABC):
"""
The building block for creating an HTTP message for an HTTP command with a body.
The building block for creating an HTTP message for an HTTP method with a body (POST and PUT).
"""
def _build_message(self, message: str) -> bytes:
body = input(f"Enter {self.command} data: ").encode(FORMAT)
body = input(f"Enter {self.method} data: ").encode(FORMAT)
print()
message += "Content-Type: text/plain\r\n"
@@ -145,29 +160,36 @@ class AbstractWithBodyCommand(AbstractCommand, ABC):
class HeadCommand(AbstractCommand):
"""
A Command for sending a `HEAD` message.
A Command for sending a `HEAD` request.
"""
@property
def command(self):
def method(self):
return "HEAD"
class GetCommand(AbstractCommand):
"""
A Command for sending a `GET` message.
A Command for sending a `GET` request.
"""
dir: str
def __init__(self, uri: str, port, dir=None):
def __init__(self, uri: str, port, directory=None):
super().__init__(uri, port)
self.dir = dir
self.dir = directory
self.filename = None
@property
def command(self):
def method(self):
return "GET"
def _get_preamble(self, retriever):
def _get_preamble(self, client):
"""
Returns the preamble (start-line and headers) of the response of this command.
@param client: the client object to retrieve from
@return: A Message object containing the HTTP-version, status code, status message, headers and buffer
"""
retriever = PreambleRetriever(client)
lines = retriever.retrieve()
(version, status, msg) = parser.parse_status_line(next(lines))
headers = parser.parse_headers(lines)
@@ -177,8 +199,11 @@ class GetCommand(AbstractCommand):
return Message(version, status, msg, headers, buffer)
def _await_response(self, client, retriever):
msg = self._get_preamble(retriever)
def _await_response(self, client):
"""
Handles the response of this command.
"""
msg = self._get_preamble(client)
from client import response_handler
self.filename = response_handler.handle(client, msg, self, self.dir)
@@ -186,19 +211,19 @@ class GetCommand(AbstractCommand):
class PostCommand(AbstractWithBodyCommand):
"""
A command for sending a `POST` command.
A command for sending a `POST` request.
"""
@property
def command(self):
def method(self):
return "POST"
class PutCommand(AbstractWithBodyCommand):
"""
A command for sending a `PUT` command.
A command for sending a `PUT` request.
"""
@property
def command(self):
def method(self):
return "PUT"

View File

@@ -2,11 +2,6 @@ import socket
from httplib.httpsocket import HTTPSocket, InvalidResponse
BUFSIZE = 4096
TIMEOUT = 3
FORMAT = "UTF-8"
MAXLINE = 4096
class HTTPClient(HTTPSocket):
host: str

View File

@@ -4,15 +4,17 @@ import re
from abc import ABC, abstractmethod
from urllib.parse import urlsplit, unquote
from bs4 import BeautifulSoup, Tag
from client.command import AbstractCommand, GetCommand
from client.httpclient import HTTPClient, FORMAT
from client.httpclient import HTTPClient
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.httpsocket import FORMAT
from httplib.message import ClientMessage as Message
from httplib.retriever import Retriever
BASE_REGEX = re.compile(r"<\s*base.*href\s*=\s*['\"](\S*)['\"][^>]*>", re.M | re.I)
IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
handler = BasicResponseHandler(client, msg, command)
@@ -83,8 +85,10 @@ class BasicResponseHandler(ResponseHandler):
if 300 <= self.msg.status < 400:
# Redirect
self._skip_body()
return self._do_handle_redirect()
if 400 <= self.msg.status < 600:
self._skip_body()
# Dump headers and exit with error
if not self.cmd.sub_request:
print("".join(self.msg.raw), end="")
@@ -93,8 +97,6 @@ class BasicResponseHandler(ResponseHandler):
return None
def _do_handle_redirect(self):
self._skip_body()
if self.msg.status == 304:
print("".join(self.msg.raw), end="")
return None
@@ -203,40 +205,61 @@ class HTMLDownloadHandler(DownloadHandler):
file.write(buffer)
file.close()
self._download_images(tmp_path, self.path)
charset = parser.get_charset(self.msg.headers)
self._download_images(tmp_path, self.path, charset)
os.remove(tmp_path)
return self.path
def _download_images(self, tmp_filename, target_filename, charset=FORMAT):
    """
    Download the images referenced by an HTML document and point their `src` attributes
    to the local copies.

    The document is read from `tmp_filename`, every `<img ... src="...">` found by
    `IMG_REGEX` is downloaded through `__download_image`, and the document with the
    rewritten `src` values is written to `target_filename`. Images that fail to download
    keep their original `src`.
    @param tmp_filename: path of the temporary file containing the retrieved HTML
    @param target_filename: path the rewritten HTML is written to
    @param charset: encoding used to decode `tmp_filename`
    """
    try:
        with open(tmp_filename, "r", encoding=charset) as fp:
            html = fp.read()
    except (UnicodeDecodeError, LookupError):
        # Either the content does not decode with the declared charset, or the charset
        # name is unknown to Python: retry with the default encoding and substitute
        # undecodable bytes instead of aborting the download.
        with open(tmp_filename, "r", encoding=FORMAT, errors="replace") as fp:
            html = fp.read()

    # A <base href="..."> element changes the URL that relative image URLs resolve against.
    base_url = self.cmd.uri
    base_element = BASE_REGEX.search(html)
    if base_element:
        base_url = parser.urljoin(self.cmd.uri, base_element.group(1))

    processed = {}  # original src -> location of the downloaded image
    to_replace = []  # (start, end, local file name) per rewritten src, in document order

    for m in IMG_REGEX.finditer(html):
        target = m.group(1)

        try:
            if not target:
                continue

            if target in processed:
                new_url = processed[target]
            else:
                new_url = self.__download_image(target, base_url)
                if not new_url:
                    # Image failed to download
                    continue
                processed[target] = new_url

            to_replace.append((m.start(1), m.end(1), os.path.basename(new_url)))
        except Exception as e:
            # Best effort: one broken image must not abort the whole page.
            logging.error("Failed to download image: %s, skipping...", target, exc_info=e)

    # The matches are non-overlapping and ordered, so the document can be rebuilt in a
    # single pass instead of re-slicing the whole string once per image.
    pieces = []
    cursor = 0
    for (start, end, path) in to_replace:
        pieces.append(html[cursor:start])
        pieces.append(path)
        cursor = end
    pieces.append(html[cursor:])

    with open(target_filename, 'w', encoding=FORMAT) as file:
        file.write("".join(pieces))
def __download_image(self, img_src, base_url):
"""

View File

@@ -1,9 +1,11 @@
import logging
import re
import urllib
from typing import Dict
from urllib.parse import urlparse, urlsplit
from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest, InvalidRequestLine
from httplib.httpsocket import FORMAT
def _is_valid_http_version(http_version: str):
@@ -164,7 +166,7 @@ def get_uri(url: str):
parsed = urlsplit(url)
result = f"http://{parsed.netloc}{parsed.path}"
if parsed.query != '':
if parsed.query != "":
result = f"{result}?{parsed.query}"
return result
@@ -175,3 +177,13 @@ def urljoin(base, url):
Join a base url and a URL to form a absolute url.
"""
return urllib.parse.urljoin(base, url)
def get_charset(headers: Dict[str, str]):
    """
    Extract the character set declared in the `content-type` header.

    Both the bare form (`charset=utf-8`) and the quoted form (`charset="utf-8"`) are
    recognised.
    @param headers: The message headers; the key `content-type` is looked up as written.
    @return: The declared charset name, or `FORMAT` when the header is absent or does not
    declare a non-empty charset.
    """
    content_type = headers.get("content-type", "")

    # The value may be wrapped in quotes, and charset names may contain `_`, `.`, `:` or `+`
    # besides letters, digits and `-`. Requiring at least one character prevents returning
    # an empty charset name, which is not a usable encoding.
    match = re.search(r"charset\s*=\s*[\"']?([\w.:+\-]+)", content_type, re.I | re.A)
    if match:
        return match.group(1)

    return FORMAT

View File

@@ -1,3 +1 @@
beautifulsoup4~=4.9.3
lxml~=4.6.2
cssutils~=2.2.0

View File

@@ -6,12 +6,12 @@ from datetime import datetime
from time import mktime
from wsgiref.handlers import format_date_time
from client.httpclient import FORMAT
from httplib import parser
from httplib.exceptions import NotFound, Forbidden, NotModified
from httplib.httpsocket import FORMAT
from httplib.message import ServerMessage as Message
root = os.path.join(os.path.dirname(sys.argv[0]), "public")
CONTENT_ROOT = os.path.join(os.path.dirname(sys.argv[0]), "public")
status_message = {
200: "OK",
@@ -26,6 +26,12 @@ status_message = {
def create(message: Message):
"""
Creates a Command based on the specified message
@param message: the message to create the Command with.
@return: An instance of `AbstractCommand`
"""
if message.method == "GET":
return GetCommand(message)
elif message.method == "HEAD":
@@ -102,9 +108,9 @@ class AbstractCommand(ABC):
norm_path = os.path.normpath(self.msg.target.path)
if norm_path == "/":
path = root + "/index.html"
path = CONTENT_ROOT + "/index.html"
else:
path = root + norm_path
path = CONTENT_ROOT + norm_path
if check and not os.path.exists(path):
raise NotFound(path)
@@ -169,7 +175,7 @@ class AbstractModifyCommand(AbstractCommand, ABC):
else:
status = 201
location = parser.urljoin("/", os.path.relpath(path, root))
location = parser.urljoin("/", os.path.relpath(path, CONTENT_ROOT))
return self._build_message(status, "text/plain", b"", {"Location": location})

View File

@@ -3,11 +3,6 @@ import socket
from httplib.exceptions import BadRequest
from httplib.httpsocket import HTTPSocket
BUFSIZE = 4096
TIMEOUT = 3
FORMAT = "UTF-8"
MAXLINE = 4096
class ServerSocket(HTTPSocket):