client: cleanup

This commit is contained in:
2021-03-21 00:01:31 +01:00
parent fa8d08d63d
commit d8a5765fd8
4 changed files with 242 additions and 374 deletions

View File

@@ -10,121 +10,6 @@ from client.Retriever import Retriever
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
def handle(client: HTTPClient, url: str):
    """Read one HTTP response from ``client`` and let a handler process it.

    The status-line and the headers are parsed first; the handler returned by
    ``construct`` then deals with the rest of the response.  ``InvalidResponse``,
    ``InvalidStatusLine`` and ``UnsupportedEncoding`` are logged at debug level
    and not propagated.
    """
    logging.debug("Waiting for response")

    try:
        version, status, _reason = get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)

        parsed_headers = get_headers(client)
        logging.debug("Parsed headers: %r", parsed_headers)

        construct(client, parsed_headers, status, url).handle()
    except InvalidResponse as error:
        logging.debug("Internal error: Response could not be parsed", exc_info=error)
    except InvalidStatusLine as error:
        logging.debug("Internal error: Invalid status-line in response", exc_info=error)
    except UnsupportedEncoding as error:
        logging.debug("Internal error: Unsupported encoding in response", exc_info=error)
def get_status_line(client: HTTPClient):
    """Read one line from ``client`` and validate it as an HTTP status-line.

    The line must look like ``HTTP/1.x <three-digit status> <reason>``.

    Returns:
        A ``(version, status, reason)`` tuple: the text after ``HTTP/``
        (``"1.0"`` or ``"1.1"``), the status code as an ``int`` and the first
        space-separated token of the reason phrase (returned as read, so it
        may still carry the line terminator).

    Raises:
        InvalidStatusLine: if the line is not a well-formed status-line.
    """
    line = client.read_line()
    # Runs of spaces yield empty strings when splitting; drop them.
    parts = [part for part in line.split(" ") if part]
    if len(parts) < 3:
        raise InvalidStatusLine(line)
    http_version, status_text, reason = parts[:3]

    # Check HTTP version.  The whole token has to match: the former pattern
    # "1\.[0|1]" also accepted "|" and ignored trailing characters.
    name, _, version = http_version.partition("/")
    if name != "HTTP" or not re.fullmatch(r"1\.[01]", version):
        raise InvalidStatusLine(line)

    # Exactly three ASCII digits, so int() below cannot fail on input such
    # as "200abc".
    if not re.fullmatch(r"[0-9]{3}", status_text):
        raise InvalidStatusLine(line)
    status = int(status_text)
    if status < 100:
        raise InvalidStatusLine(line)

    return version, status, reason
def get_headers(client: HTTPClient):
    """Read the header section of a response from ``client`` into a dict.

    Lines are read until the blank line that ends the header section, or
    until ``read_line`` returns an empty string.  Lines that do not have a
    ``name: value`` shape are ignored.

    Returns:
        dict mapping lower-cased header names to lower-cased values.  When a
        name occurs more than once the last value is kept.

    Raises:
        Whatever ``check_next_header`` raises for a header it rejects.
    """
    end_of_headers = ("\r\n", "\n", " ", "")
    raw_lines = []

    # Whitespace-prefixed lines directly after the status-line are discarded,
    # but the terminating blank line must not be swallowed with them
    # (otherwise a response without headers would be read past its end).
    line = client.read_line()
    while line not in end_of_headers and line[0].isspace():
        line = client.read_line()

    while line not in end_of_headers:
        if line[0].isspace():
            # Folded continuation line: join it to the previous header with
            # a single space instead of gluing the two parts together.
            raw_lines[-1] = raw_lines[-1].rstrip("\r\n") + " "
        # Strip only blanks so a whitespace-only continuation keeps its line
        # terminator and cannot merge two headers into one.
        raw_lines.append(line.lstrip(" \t"))
        line = client.read_line()

    result = {}
    for entry in "".join(raw_lines).splitlines():
        name, colon, value = entry.partition(":")
        # Skip lines without a colon, with an empty name, or with nothing
        # at all after the colon.
        if not colon or not name or not value:
            continue
        header, value = name.strip(), value.strip()
        check_next_header(result, header, value)
        result[header.lower()] = value.lower()

    return result
def check_next_header(headers, next_header: str, next_value: str):
    """Validate a header before it is added to the already parsed ``headers``.

    Only ``Content-Length`` is checked: it may occur at most once and its
    value must be a non-negative decimal integer.

    Args:
        headers: headers parsed so far, keyed by lower-cased name.
        next_header: name of the header about to be added, in any letter case.
        next_value: value of that header.

    Raises:
        InvalidResponse: on a repeated or malformed ``Content-Length``.
    """
    # Header names are case-insensitive and the name arrives as received, so
    # normalise it before comparing.
    if next_header.lower() != "content-length":
        return

    if "content-length" in headers:
        logging.error("Multiple content-length headers specified")
        raise InvalidResponse()

    # ASCII digits only: str.isnumeric() also accepts characters that int()
    # cannot parse.  Zero is a valid length (empty body).
    if not re.fullmatch(r"[0-9]+", next_value):
        logging.error("Invalid content-length value: %r", next_value)
        raise InvalidResponse()
def construct(client: HTTPClient, headers, status_code, url):
    """Choose the download handler for a response from its parsed headers.

    Returns an ``HTMLDownloadHandler`` when the content-type contains
    ``text/html`` and a ``RawDownloadHandler`` otherwise.  ``status_code`` is
    accepted but not used here.

    Raises:
        UnsupportedEncoding: for a transfer-encoding other than ``chunked``
            or for any content-encoding.
    """
    # only chunked transfer-encoding is supported
    encoding = headers.get("transfer-encoding")
    if encoding and encoding != "chunked":
        raise UnsupportedEncoding("transfer-encoding", encoding)

    # content-encoding is not supported
    compression = headers.get("content-encoding")
    if compression:
        raise UnsupportedEncoding("content-encoding", compression)

    retriever = Retriever.create(client, headers)

    is_html = "text/html" in (headers.get("content-type") or "")
    handler_cls = HTMLDownloadHandler if is_html else RawDownloadHandler
    return handler_cls(retriever, client, headers, url)
def parse_uri(uri: str):
parsed = urlparse(uri)
@@ -156,6 +41,98 @@ class ResponseHandler:
def handle(self):
    """Process the response; this base implementation does nothing."""
    return None
@staticmethod
def create(client: HTTPClient, headers, status_code, url):
    """Choose the download handler for a response from its parsed headers.

    Returns an ``HTMLDownloadHandler`` when the content-type contains
    ``text/html`` and a ``RawDownloadHandler`` otherwise.  ``status_code`` is
    accepted but not used here.

    Raises:
        UnsupportedEncoding: for a transfer-encoding other than ``chunked``
            or for any content-encoding.
    """
    # only chunked transfer-encoding is supported
    encoding = headers.get("transfer-encoding")
    if encoding and encoding != "chunked":
        raise UnsupportedEncoding("transfer-encoding", encoding)

    # content-encoding is not supported
    compression = headers.get("content-encoding")
    if compression:
        raise UnsupportedEncoding("content-encoding", compression)

    retriever = Retriever.create(client, headers)

    is_html = "text/html" in (headers.get("content-type") or "")
    handler_cls = HTMLDownloadHandler if is_html else RawDownloadHandler
    return handler_cls(retriever, client, headers, url)
@staticmethod
def get_status_line(client: HTTPClient):
line = client.read_line()
split = list(filter(None, line.split(" ")))
if len(split) < 3:
raise InvalidStatusLine(line)
# Check HTTP version
http_version = split.pop(0)
if len(http_version) < 8 or http_version[4] != "/":
raise InvalidStatusLine(line)
(name, version) = http_version[:4], http_version[5:]
if name != "HTTP" or not re.match(r"1\.[0|1]", version):
raise InvalidStatusLine(line)
status = split.pop(0)
if not re.match(r"\d{3}", status):
raise InvalidStatusLine(line)
status = int(status)
if status < 100 or status > 999:
raise InvalidStatusLine(line)
reason = split.pop(0)
return version, status, reason
@staticmethod
def get_headers(client: HTTPClient):
headers = []
# first header after the status-line may not contain a space
while True:
line = client.read_line()
if line[0].isspace():
continue
else:
break
while True:
if line in ("\r\n", "\n", " "):
break
if line[0].isspace():
headers[-1] = headers[-1].rstrip("\r\n")
headers.append(line.lstrip())
line = client.read_line()
result = {}
header_str = "".join(headers)
for line in header_str.splitlines():
pos = line.find(":")
if pos <= 0 or pos >= len(line) - 1:
continue
(header, value) = map(str.strip, line.split(":", 1))
ResponseHandler.check_next_header(result, header, value)
result[header.lower()] = value.lower()
return result
@staticmethod
def check_next_header(headers, next_header: str, next_value: str):
if next_header == "content-length":
if "content-length" in headers:
logging.error("Multiple content-length headers specified")
raise InvalidResponse()
if not next_value.isnumeric() or int(next_value) <= 0:
logging.error("Invalid content-length value: %r", next_value)
raise InvalidResponse()
class DownloadHandler(ResponseHandler):
path: str
@@ -220,9 +197,9 @@ class DownloadHandler(ResponseHandler):
def _handle_sub_request(self, client, url):
(version, status, _) = get_status_line(client)
(version, status, _) = self.get_status_line(client)
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
headers = get_headers(client)
headers = self.get_headers(client)
logging.debug("Parsed headers: %r", headers)
if status != 200:
@@ -275,30 +252,38 @@ class HTMLDownloadHandler(DownloadHandler):
with open(tmp_filename, "rb") as fp:
soup = BeautifulSoup(fp, 'html.parser')
base_url = self.url
base_element = soup.find("base")
if base_element:
base_url = base_element["href"]
for tag in soup.find_all("img"):
try:
tag["src"] = self.__download_image(tag["src"], host, path)
tag["src"] = self.__download_image(tag["src"], host, base_url)
except Exception as e:
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
logging.debug(e)
logging.error("Failed to download image: %s, skipping...", tag["src"])
with open(target_filename, 'w') as file:
file.write(str(soup))
def __download_image(self, img_src, host, path):
def __download_image(self, img_src, host, base_url):
parsed = urlparse(img_src)
logging.debug("Downloading image: %s", img_src)
same_host = True
if len(parsed.netloc) == 0 and parsed.path != "/":
# relative url, append base_url
img_src = os.path.join(os.path.dirname(base_url), parsed.path)
parsed = urlparse(img_src)
# Check if the image is located on the same server
if len(parsed.netloc) == 0 or parsed.netloc == host:
same_host = True
img_host = host
if parsed.path[0] != "/":
base = os.path.split(path)[0]
if base[-1] != '/':
base += "/"
img_path = base + parsed.path
else:
img_path = parsed.path
img_path = parsed.path
else:
same_host = False
(img_host, img_path) = parse_uri(img_src)

126
client/command.py Normal file
View File

@@ -0,0 +1,126 @@
import logging
from urllib.parse import urlparse
from client.ResponseHandler import ResponseHandler
from client.httpclient import FORMAT, HTTPClient, InvalidResponse, InvalidStatusLine, UnsupportedEncoding
class Command:
    """Base class for one HTTP request issued by the client.

    Subclasses set ``command`` to the HTTP method name and override
    ``_await_response`` (and, when the request has a body, ``_build_message``).
    """

    # HTTP method name placed on the request line; set by each subclass.
    command: str

    def __init__(self, url: str, port: str):
        self.url = url
        self.port = port

    @staticmethod
    def create(command: str, url: str, port: str):
        """Return the ``Command`` subclass instance for ``command``.

        Raises:
            ValueError: if ``command`` is not GET, HEAD, POST or PUT.
        """
        if command == "GET":
            return GetCommand(url, port)
        if command == "HEAD":
            return HeadCommand(url, port)
        if command == "POST":
            return PostCommand(url, port)
        if command == "PUT":
            return PutCommand(url, port)
        raise ValueError(f"Unsupported command: {command!r}")

    def execute(self):
        """Connect to the host of ``self.url``, send the request and process the response.

        ``InvalidResponse``, ``InvalidStatusLine`` and ``UnsupportedEncoding``
        raised while handling the response are logged at debug level and not
        propagated.  The client is closed on every path, including a failed
        connect or send.
        """
        (host, path) = self.parse_uri()

        client = HTTPClient(host)
        try:
            client.connect((host, int(self.port)))

            message = f"{self.command} {path} HTTP/1.1\r\n"
            message += f"Host: {host}\r\n"
            message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
            encoded_msg = self._build_message(message)

            logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
            logging.debug("Sending HTTP message: %r", encoded_msg)
            client.sendall(encoded_msg)
            logging.info("HTTP request sent, awaiting response...")

            self._await_response(client)
        except InvalidResponse as e:
            logging.debug("Internal error: Response could not be parsed", exc_info=e)
        except InvalidStatusLine as e:
            logging.debug("Internal error: Invalid status-line in response", exc_info=e)
        except UnsupportedEncoding as e:
            logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
        finally:
            client.close()

    def _await_response(self, client: HTTPClient):
        """Read and process the response; the base implementation does nothing."""
        pass

    def _build_message(self, message: str) -> bytes:
        """Terminate the header section with a blank line and encode the request."""
        return (message + "\r\n").encode(FORMAT)

    def parse_uri(self):
        """Split ``self.url`` into the host name and the request target.

        Returns:
            ``(host, path)``: the network location without any ``:port``
            suffix, and the path with a leading ``/`` followed by
            ``?query`` when the URL has a query string.
        """
        parsed = urlparse(self.url)

        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlparse("//" + self.url)

        path = parsed.path
        if not path.startswith("/"):
            path = "/" + path
        # The query string belongs to the request target; without it the
        # server would be asked for a different resource.
        if parsed.query:
            path += "?" + parsed.query

        # Drop a ":port" suffix from the network location.
        host = parsed.netloc.partition(":")[0]

        return host, path
class HeadCommand(Command):
    """``HEAD`` request: echoes the response up to and including the blank line."""

    command = "HEAD"

    def _await_response(self, client):
        """Print every line read from ``client`` until a blank or empty line is printed."""
        terminators = ("\r\n", "\n", "")
        current = None
        # Each line is printed before it is tested, so the terminating line
        # itself is echoed as well.
        while current not in terminators:
            current = client.read_line()
            print(current, end="")
class GetCommand(Command):
    """``GET`` request: parses the response and passes it to a response handler."""

    command = "GET"

    def _await_response(self, client):
        """Parse status-line and headers, then run the handler created for them."""
        version, status, _reason = ResponseHandler.get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)

        parsed_headers = ResponseHandler.get_headers(client)
        logging.debug("Parsed headers: %r", parsed_headers)

        ResponseHandler.create(client, parsed_headers, status, self.url).handle()
class PostCommand(HeadCommand):
    """``POST`` request whose body is read interactively from standard input.

    The response is processed by the inherited ``HeadCommand._await_response``.
    """

    command = "POST"

    def _build_message(self, message: str) -> bytes:
        """Prompt for the body and append it, with its headers, to ``message``.

        Returns:
            The encoded request: ``message``, the ``Content-Type`` and
            ``Content-Length`` headers, the blank line and exactly
            ``Content-Length`` bytes of body.
        """
        body = input("Enter POST data: ").encode(FORMAT)
        print()

        head = message
        head += "Content-Type: text/plain\r\n"
        head += f"Content-Length: {len(body)}\r\n"
        head += "\r\n"

        # No CRLF after the body: it would not be covered by Content-Length
        # and a request must not be followed by an extra CRLF.
        return head.encode(FORMAT) + body
class PutCommand(PostCommand):
    """``PUT`` request; inherits everything from ``PostCommand`` except the method name."""
    command = "PUT"