Parse HTML with regex; fix small issues
This commit is contained in:
@@ -3,39 +3,40 @@ from abc import ABC, abstractmethod
|
||||
from typing import Dict, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from client.httpclient import FORMAT, HTTPClient
|
||||
from client.httpclient import HTTPClient
|
||||
from httplib import parser
|
||||
from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding
|
||||
from httplib.httpsocket import FORMAT
|
||||
from httplib.message import ClientMessage as Message
|
||||
from httplib.retriever import PreambleRetriever
|
||||
|
||||
sockets: Dict[str, HTTPClient] = {}
|
||||
|
||||
|
||||
def create(method: str, url: str, port):
    """
    Create a corresponding Command instance of the specified HTTP `method` with the specified `url` and `port`.

    @param method: The HTTP method of the command to create (`GET`, `HEAD`, `POST` or `PUT`)
    @param url: The url for the command
    @param port: The port for the command
    @return: A `GetCommand`, `HeadCommand`, `PostCommand` or `PutCommand` for the normalised URI
    @raise ValueError: If `method` is not one of the supported HTTP methods
    """
    # The URL is normalised first, exactly as before, so URL errors surface before method errors.
    uri = parser.get_uri(url)

    # The table is built at call time because the command classes are defined further down the module.
    commands = {
        "GET": GetCommand,
        "HEAD": HeadCommand,
        "POST": PostCommand,
        "PUT": PutCommand,
    }

    try:
        command_type = commands[method]
    except KeyError:
        # Name the rejected method so the caller can see what was actually passed in.
        raise ValueError(f"Unknown HTTP method: {method}") from None

    return command_type(uri, port)
|
||||
|
||||
|
||||
class AbstractCommand(ABC):
|
||||
"""
|
||||
A class representing the command for sending an HTTP command.
|
||||
A class representing the command for sending an HTTP request.
|
||||
"""
|
||||
uri: str
|
||||
host: str
|
||||
@@ -51,10 +52,15 @@ class AbstractCommand(ABC):
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def command(self):
|
||||
def method(self):
|
||||
pass
|
||||
|
||||
def execute(self, sub_request=False):
|
||||
"""
|
||||
Creates and sends the HTTP message for this Command.
|
||||
|
||||
@param sub_request: If this execution is in function of a prior command.
|
||||
"""
|
||||
self.sub_request = sub_request
|
||||
(host, path) = self.parse_uri()
|
||||
|
||||
@@ -69,9 +75,10 @@ class AbstractCommand(ABC):
|
||||
client.conn.connect((host, self.port))
|
||||
sockets[host] = client
|
||||
|
||||
message = f"{self.command} {path} HTTP/1.1\r\n"
|
||||
message = f"{self.method} {path} HTTP/1.1\r\n"
|
||||
message += f"Host: {host}:{self.port}\r\n"
|
||||
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
||||
message += "Accept: */*\r\n"
|
||||
message += "Accept-Encoding: identity\r\n"
|
||||
encoded_msg = self._build_message(message)
|
||||
|
||||
logging.debug("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
|
||||
@@ -81,8 +88,7 @@ class AbstractCommand(ABC):
|
||||
logging.info("HTTP request sent, awaiting response...")
|
||||
|
||||
try:
|
||||
retriever = PreambleRetriever(client)
|
||||
self._await_response(client, retriever)
|
||||
self._await_response(client)
|
||||
except InvalidResponse as e:
|
||||
logging.debug("Internal error: Response could not be parsed", exc_info=e)
|
||||
return
|
||||
@@ -95,7 +101,12 @@ class AbstractCommand(ABC):
|
||||
if not sub_request:
|
||||
client.close()
|
||||
|
||||
def _await_response(self, client, retriever):
|
||||
def _await_response(self, client):
|
||||
"""
|
||||
Simple response method.
|
||||
|
||||
Receives the response and prints to stdout.
|
||||
"""
|
||||
while True:
|
||||
line = client.read_line()
|
||||
print(line, end="")
|
||||
@@ -106,11 +117,15 @@ class AbstractCommand(ABC):
|
||||
return (message + "\r\n").encode(FORMAT)
|
||||
|
||||
def parse_uri(self):
|
||||
"""
|
||||
Parses the URI and returns the hostname and path.
|
||||
@return: A tuple of the hostname and path.
|
||||
"""
|
||||
parsed = urlparse(self.uri)
|
||||
|
||||
# If there is no netloc, the url is invalid, so prepend `//` and try again
|
||||
if parsed.netloc == "":
|
||||
parsed = urlparse("//" + self.uri)
|
||||
parsed = urlparse("http://" + self.uri)
|
||||
|
||||
host = parsed.netloc
|
||||
path = parsed.path
|
||||
@@ -126,11 +141,11 @@ class AbstractCommand(ABC):
|
||||
|
||||
class AbstractWithBodyCommand(AbstractCommand, ABC):
|
||||
"""
|
||||
The building block for creating an HTTP message for an HTTP command with a body.
|
||||
The building block for creating an HTTP message for an HTTP method with a body (POST and PUT).
|
||||
"""
|
||||
|
||||
def _build_message(self, message: str) -> bytes:
|
||||
body = input(f"Enter {self.command} data: ").encode(FORMAT)
|
||||
body = input(f"Enter {self.method} data: ").encode(FORMAT)
|
||||
print()
|
||||
|
||||
message += "Content-Type: text/plain\r\n"
|
||||
@@ -145,29 +160,36 @@ class AbstractWithBodyCommand(AbstractCommand, ABC):
|
||||
|
||||
class HeadCommand(AbstractCommand):
    """
    A Command for sending a `HEAD` request.
    """

    @property
    def method(self) -> str:
        """
        The HTTP method name of this command.
        """
        return "HEAD"
|
||||
|
||||
|
||||
class GetCommand(AbstractCommand):
|
||||
"""
|
||||
A Command for sending a `GET` message.
|
||||
A Command for sending a `GET` request.
|
||||
"""
|
||||
dir: str
|
||||
|
||||
def __init__(self, uri: str, port, dir=None):
|
||||
def __init__(self, uri: str, port, directory=None):
|
||||
super().__init__(uri, port)
|
||||
self.dir = dir
|
||||
self.dir = directory
|
||||
self.filename = None
|
||||
|
||||
@property
|
||||
def command(self):
|
||||
def method(self):
|
||||
return "GET"
|
||||
|
||||
def _get_preamble(self, retriever):
|
||||
def _get_preamble(self, client):
|
||||
"""
|
||||
Returns the preamble (start-line and headers) of the response of this command.
|
||||
@param client: the client object to retrieve from
|
||||
@return: A Message object containing the HTTP-version, status code, status message, headers and buffer
|
||||
"""
|
||||
retriever = PreambleRetriever(client)
|
||||
lines = retriever.retrieve()
|
||||
(version, status, msg) = parser.parse_status_line(next(lines))
|
||||
headers = parser.parse_headers(lines)
|
||||
@@ -177,8 +199,11 @@ class GetCommand(AbstractCommand):
|
||||
|
||||
return Message(version, status, msg, headers, buffer)
|
||||
|
||||
def _await_response(self, client, retriever):
|
||||
msg = self._get_preamble(retriever)
|
||||
def _await_response(self, client):
|
||||
"""
|
||||
Handles the response of this command.
|
||||
"""
|
||||
msg = self._get_preamble(client)
|
||||
|
||||
from client import response_handler
|
||||
self.filename = response_handler.handle(client, msg, self, self.dir)
|
||||
@@ -186,19 +211,19 @@ class GetCommand(AbstractCommand):
|
||||
|
||||
class PostCommand(AbstractWithBodyCommand):
    """
    A command for sending a `POST` request.
    """

    @property
    def method(self) -> str:
        """
        The HTTP method name of this command.
        """
        return "POST"
|
||||
|
||||
|
||||
class PutCommand(AbstractWithBodyCommand):
    """
    A command for sending a `PUT` request.
    """

    @property
    def method(self) -> str:
        """
        The HTTP method name of this command.
        """
        return "PUT"
|
||||
|
@@ -2,11 +2,6 @@ import socket
|
||||
|
||||
from httplib.httpsocket import HTTPSocket, InvalidResponse
|
||||
|
||||
BUFSIZE = 4096
|
||||
TIMEOUT = 3
|
||||
FORMAT = "UTF-8"
|
||||
MAXLINE = 4096
|
||||
|
||||
|
||||
class HTTPClient(HTTPSocket):
|
||||
host: str
|
||||
|
@@ -4,15 +4,17 @@ import re
|
||||
from abc import ABC, abstractmethod
|
||||
from urllib.parse import urlsplit, unquote
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from client.command import AbstractCommand, GetCommand
|
||||
from client.httpclient import HTTPClient, FORMAT
|
||||
from client.httpclient import HTTPClient
|
||||
from httplib import parser
|
||||
from httplib.exceptions import InvalidResponse
|
||||
from httplib.httpsocket import FORMAT
|
||||
from httplib.message import ClientMessage as Message
|
||||
from httplib.retriever import Retriever
|
||||
|
||||
BASE_REGEX = re.compile(r"<\s*base.*href\s*=\s*['\"](\S*)['\"][^>]*>", re.M | re.I)
|
||||
IMG_REGEX = re.compile(r"<\s*img[^>]*\ssrc\s*=\s*['\"]([^\"']+)['\"][^>]*>", re.M | re.I)
|
||||
|
||||
|
||||
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, directory=None):
|
||||
handler = BasicResponseHandler(client, msg, command)
|
||||
@@ -83,8 +85,10 @@ class BasicResponseHandler(ResponseHandler):
|
||||
|
||||
if 300 <= self.msg.status < 400:
|
||||
# Redirect
|
||||
self._skip_body()
|
||||
return self._do_handle_redirect()
|
||||
if 400 <= self.msg.status < 600:
|
||||
self._skip_body()
|
||||
# Dump headers and exit with error
|
||||
if not self.cmd.sub_request:
|
||||
print("".join(self.msg.raw), end="")
|
||||
@@ -93,8 +97,6 @@ class BasicResponseHandler(ResponseHandler):
|
||||
return None
|
||||
|
||||
def _do_handle_redirect(self):
|
||||
self._skip_body()
|
||||
|
||||
if self.msg.status == 304:
|
||||
print("".join(self.msg.raw), end="")
|
||||
return None
|
||||
@@ -203,40 +205,61 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
file.write(buffer)
|
||||
file.close()
|
||||
|
||||
self._download_images(tmp_path, self.path)
|
||||
charset = parser.get_charset(self.msg.headers)
|
||||
self._download_images(tmp_path, self.path, charset)
|
||||
os.remove(tmp_path)
|
||||
return self.path
|
||||
|
||||
def _download_images(self, tmp_filename, target_filename, charset=FORMAT):
    """
    Downloads the images referenced by `<img>` tags in the HTML stored in `tmp_filename` and writes the HTML
    to `target_filename`, with the `src` of every successfully downloaded image replaced by its local file name.

    @param tmp_filename: path of the file containing the HTML as it was received
    @param target_filename: path of the file to write the rewritten HTML to
    @param charset: the character set used to decode the received HTML
    """
    try:
        with open(tmp_filename, "r", encoding=charset) as fp:
            html = fp.read()
    except (UnicodeDecodeError, LookupError):
        # The charset is either wrong for this content or unknown to Python (`open` raises LookupError
        # for an unknown encoding), so fall back to a lenient decode instead of aborting the download.
        with open(tmp_filename, "r", encoding=FORMAT, errors="replace") as fp:
            html = fp.read()

    # A <base href="..."> element overrides the URL that relative image paths are resolved against.
    base_url = self.cmd.uri
    base_element = BASE_REGEX.search(html)
    if base_element:
        base_url = parser.urljoin(self.cmd.uri, base_element.group(1))

    processed = {}   # original src -> URL returned by a successful download
    to_replace = []  # (start, end, local file name) for each src to rewrite, in document order

    for m in IMG_REGEX.finditer(html):
        target = m.group(1)

        try:
            if not target:
                continue

            if target in processed:
                new_url = processed[target]
            else:
                new_url = self.__download_image(target, base_url)
                if not new_url:
                    # Image failed to download
                    continue
                processed[target] = new_url

            to_replace.append((m.start(1), m.end(1), os.path.basename(new_url)))
        except Exception as e:
            # Deliberately best-effort: one broken image must not abort the whole page.
            logging.error("Failed to download image: %s, skipping...", target, exc_info=e)

    # finditer yields non-overlapping matches from left to right, so the document can be rebuilt in a
    # single pass by copying the text between the replaced spans.
    pieces = []
    cursor = 0
    for (start, end, path) in to_replace:
        pieces.append(html[cursor:start])
        pieces.append(path)
        cursor = end
    pieces.append(html[cursor:])

    with open(target_filename, 'w', encoding=FORMAT) as file:
        file.write("".join(pieces))
|
||||
|
||||
def __download_image(self, img_src, base_url):
|
||||
"""
|
||||
|
@@ -1,9 +1,11 @@
|
||||
import logging
|
||||
import re
|
||||
import urllib
|
||||
from typing import Dict
|
||||
from urllib.parse import urlparse, urlsplit
|
||||
|
||||
from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest, InvalidRequestLine
|
||||
from httplib.httpsocket import FORMAT
|
||||
|
||||
|
||||
def _is_valid_http_version(http_version: str):
|
||||
@@ -164,7 +166,7 @@ def get_uri(url: str):
|
||||
parsed = urlsplit(url)
|
||||
|
||||
result = f"http://{parsed.netloc}{parsed.path}"
|
||||
if parsed.query != '':
|
||||
if parsed.query != "":
|
||||
result = f"{result}?{parsed.query}"
|
||||
|
||||
return result
|
||||
@@ -175,3 +177,13 @@ def urljoin(base, url):
|
||||
Join a base url and a URL to form a absolute url.
|
||||
"""
|
||||
return urllib.parse.urljoin(base, url)
|
||||
|
||||
|
||||
def get_charset(headers: Dict[str, str]):
    """
    Returns the character set named by the `charset` parameter of the `content-type` header.

    The value may be quoted (`charset="utf-8"`) and may contain the characters `_`, `.`, `:`, `+` and `-`
    (e.g. `Shift_JIS`). An empty `charset=` is treated as absent.

    @param headers: the headers to search; the `content-type` key is looked up in lowercase
    @return: the charset exactly as written in the header, or `FORMAT` if the header or parameter is missing
    """
    content_type = headers.get("content-type")
    if content_type:
        # Optional opening quote, then at least one charset-name character, so an empty value never matches.
        match = re.search(r"charset\s*=\s*[\"']?([a-z0-9._:+\-]+)", content_type, re.I)
        if match:
            return match.group(1)

    return FORMAT
|
||||
|
@@ -1,3 +1 @@
|
||||
beautifulsoup4~=4.9.3
|
||||
lxml~=4.6.2
|
||||
cssutils~=2.2.0
|
@@ -6,12 +6,12 @@ from datetime import datetime
|
||||
from time import mktime
|
||||
from wsgiref.handlers import format_date_time
|
||||
|
||||
from client.httpclient import FORMAT
|
||||
from httplib import parser
|
||||
from httplib.exceptions import NotFound, Forbidden, NotModified
|
||||
from httplib.httpsocket import FORMAT
|
||||
from httplib.message import ServerMessage as Message
|
||||
|
||||
root = os.path.join(os.path.dirname(sys.argv[0]), "public")
|
||||
CONTENT_ROOT = os.path.join(os.path.dirname(sys.argv[0]), "public")
|
||||
|
||||
status_message = {
|
||||
200: "OK",
|
||||
@@ -26,6 +26,12 @@ status_message = {
|
||||
|
||||
|
||||
def create(message: Message):
|
||||
"""
|
||||
Creates a Command based on the specified message
|
||||
@param message: the message to create the Command with.
|
||||
@return: An instance of `AbstractCommand`
|
||||
"""
|
||||
|
||||
if message.method == "GET":
|
||||
return GetCommand(message)
|
||||
elif message.method == "HEAD":
|
||||
@@ -102,9 +108,9 @@ class AbstractCommand(ABC):
|
||||
norm_path = os.path.normpath(self.msg.target.path)
|
||||
|
||||
if norm_path == "/":
|
||||
path = root + "/index.html"
|
||||
path = CONTENT_ROOT + "/index.html"
|
||||
else:
|
||||
path = root + norm_path
|
||||
path = CONTENT_ROOT + norm_path
|
||||
|
||||
if check and not os.path.exists(path):
|
||||
raise NotFound(path)
|
||||
@@ -169,7 +175,7 @@ class AbstractModifyCommand(AbstractCommand, ABC):
|
||||
else:
|
||||
status = 201
|
||||
|
||||
location = parser.urljoin("/", os.path.relpath(path, root))
|
||||
location = parser.urljoin("/", os.path.relpath(path, CONTENT_ROOT))
|
||||
return self._build_message(status, "text/plain", b"", {"Location": location})
|
||||
|
||||
|
||||
|
@@ -3,11 +3,6 @@ import socket
|
||||
from httplib.exceptions import BadRequest
|
||||
from httplib.httpsocket import HTTPSocket
|
||||
|
||||
BUFSIZE = 4096
|
||||
TIMEOUT = 3
|
||||
FORMAT = "UTF-8"
|
||||
MAXLINE = 4096
|
||||
|
||||
|
||||
class ServerSocket(HTTPSocket):
|
||||
|
||||
|
Reference in New Issue
Block a user