Client update

This commit is contained in:
2021-03-19 03:29:35 +01:00
parent 1966a174bb
commit 797cdb0c0e
4 changed files with 425 additions and 37 deletions

View File

@@ -1,26 +1,110 @@
import logging
import os
from socket import socket
from typing import Dict
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
def handle(client: HTTPClient, url: str):
    """Receive one response from `client` and dispatch it to a response handler.

    The response is read with `client.receive()`, the header chunk is split
    off and parsed, the status line is validated, and the handler returned by
    `construct` is given the remaining bytes. On a timeout or an unparsable
    response an `[ABRT]` message is printed and the function returns without
    raising.
    """
    logging.debug("Waiting for response")
    try:
        data = client.receive()
    except TimeoutError:
        print("[ABRT] Response timed out")
        return None

    try:
        head, data = client.get_crlf_chunk(data)
        status_line, headers = client.parse_headers(head)
        client.validate_status_line(status_line)
        # The status code is the second space-separated token of the status line.
        code = int(status_line.split(" ")[1])
        construct(client, headers, code, url).handle(data)
    except (InvalidResponse, InvalidStatusLine, UnsupportedEncoding) as err:
        # Checked in the same order as separate except clauses would match,
        # so the first matching type decides the debug message.
        if isinstance(err, InvalidResponse):
            reason = "Internal error: Response could not be parsed"
        elif isinstance(err, InvalidStatusLine):
            reason = "Internal error: Invalid status-line in response"
        else:
            reason = "Internal error: Unsupported encoding in response"
        logging.debug(reason, exc_info=err)
        print("[ABRT] Invalid response")
    return None
def construct(client: HTTPClient, headers, status_code, url):
    """Build the response handler that matches the given response headers.

    Raises `UnsupportedEncoding` when a transfer-encoding other than
    "chunked" is present, or when any content-encoding is present. Otherwise
    returns a `ChunkedResponseHandler` for chunked responses, an
    `HTMLResponseHandler` when the content-type contains "text/html", and a
    `PlainResponseHandler` in every other case.
    """
    # only chunked transfer-encoding is supported
    transfer_encoding = headers.get("transfer-encoding")
    if transfer_encoding and transfer_encoding != "chunked":
        raise UnsupportedEncoding("transfer-encoding", transfer_encoding)

    # content-encoding is not supported
    content_encoding = headers.get("content-encoding")
    if content_encoding:
        raise UnsupportedEncoding("content-encoding", content_encoding)

    if transfer_encoding:
        handler_cls = ChunkedResponseHandler
    else:
        content_type = headers.get("content-type")
        is_html = bool(content_type and "text/html" in content_type)
        handler_cls = HTMLResponseHandler if is_html else PlainResponseHandler

    return handler_cls(client, headers, status_code, url)
def parse_uri(uri: str):
    """Split `uri` into a `(host, path)` tuple.

    The host is the network location of the URI. The returned path always
    starts with `/`; an empty path becomes `/`.
    """
    parts = urlparse(uri)
    if not parts.netloc:
        # Without a leading `//` urlparse puts the host into the path, so
        # prepend it and parse again.
        parts = urlparse("//" + uri)

    path = parts.path
    if not path.startswith("/"):
        path = "/" + path
    return parts.netloc, path
# Base class for response handlers. It keeps the client, the parsed headers,
# the status code and the requested URL, and derives a local filename from
# the URL.
# NOTE(review): this span comes from a rendered diff with indentation stripped;
# removed and added lines appear side by side (two `client:` annotations, two
# `__init__` definitions, a bare `def get_html_filename` header) -- confirm
# which lines belong to the current version before editing the code.
class ResponseHandler:
# presumably the removed annotations (socket-based client) -- TODO confirm
client: socket
url: str
# annotations matching the attributes assigned by the four-argument __init__
client: HTTPClient
headers: Dict[str, str]
status_code: int
url: str
# presumably the removed constructor -- its (url, client) signature does not
# match the (client, headers, status_code, url) calls made by `construct`
def __init__(self, url: str, client: socket):
self.headers = {}
self.url = url
# Stores the client, headers, status code and URL on the instance.
def __init__(self, client: HTTPClient, headers: Dict[str, str], status_code: int, url: str):
self.client = client
self.headers = headers
self.status_code = status_code
self.url = url
pass
# presumably the removed method header that `get_filename` replaces -- TODO confirm
def get_html_filename(self):
# Does nothing in the base class; subclasses define their own `handle`.
def handle(self, buffer: bytes):
pass
def get_filename(self):
"""Returns the filename to download the payload to.
"""
# Default name, used when no usable name can be derived from the URL path.
filename = "index.html"
parsed = urlparse(self.url)
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + self.url)
# If the path contains a `/`, get only the last part and use it as filename.
# If the path ends with a `/`, it's a directory so ignore it.
if len(parsed.path) != 0:
index = parsed.path.rfind("/")
if index == -1:
# NOTE(review): the hunk marker below hides lines of this method (the body of
# the `index == -1` branch and whatever condition guards the next assignment);
# they are not visible here and must be restored from the real file.
@@ -29,14 +113,179 @@ class ResponseHandler:
filename = parsed.path[index:]
# Keep only the final path component, without surrounding whitespace.
result = os.path.basename(filename).strip()
# presumably the removed line, superseded by the alphanumeric check below
return result
# Use the derived name only if it contains at least one alphanumeric
# character; otherwise fall back to "index.html".
if any(letter.isalnum() for letter in result):
return result
return "index.html"
def _handle_download(self, client, url):
    """Receive one response from `client` and save its payload.

    Works like the module-level `handle`, but only accepts status code 200
    and returns whatever the constructed handler's `handle` returns (the
    filename for file-writing handlers). On a timeout or an invalid response
    an `[ABRT]` message is printed and `None` is returned.
    """
    logging.debug("Waiting for response")
    try:
        data = client.receive()
    except TimeoutError:
        print("[ABRT] Response timed out")
        return None

    try:
        head, data = client.get_crlf_chunk(data)
        status_line, headers = client.parse_headers(head)
        client.validate_status_line(status_line)
        # The status code is the second space-separated token of the status line.
        code = int(status_line.split(" ")[1])
        if code != 200:
            raise InvalidResponse("Code not 200")
        return construct(client, headers, code, url).handle(data)
    except (InvalidResponse, InvalidStatusLine, UnsupportedEncoding) as err:
        # Checked in the same order as separate except clauses would match,
        # so the first matching type decides the debug message.
        if isinstance(err, InvalidResponse):
            reason = "Internal error: Response could not be parsed"
        elif isinstance(err, InvalidStatusLine):
            reason = "Internal error: Invalid status-line in response"
        else:
            reason = "Internal error: Unsupported encoding in response"
        logging.debug(reason, exc_info=err)
        print("[ABRT] Invalid response")
        return None
class PlainResponseHandler(ResponseHandler):
    """Response handler that writes a non-chunked payload to a local file."""

    def __init__(self, client: HTTPClient, headers, status_code, url):
        # Only this signature is kept: it is the one `construct` calls with.
        super().__init__(client, headers, status_code, url)

    def _get_payload_size(self):
        """Return the payload size announced by the content-length header.

        Returns `None` when content-length is 0 (nothing to download), `-1`
        when no content-length header is present, and the announced length
        otherwise.
        """
        content_length = self.__get_content_length()
        if content_length is None:
            logging.debug("No content-length specified")
            return -1
        if content_length == 0:
            logging.debug("content-length is 0")
            return None

        logging.debug("Expected content-length=%s", content_length)
        return content_length

    def handle(self, buffer: bytes):
        """Write the payload to the file named by `get_filename()`.

        `buffer` holds the payload bytes that were already received together
        with the headers. Returns the filename, or `None` when the response
        announces an empty payload.
        """
        payload_size = self._get_payload_size()
        if payload_size is None:
            return None

        logging.debug("Retrieving payload")
        filename = self.get_filename()
        # The context manager closes the file even when receiving fails.
        with open(filename, "wb") as file:
            self._retrieve(file, buffer, payload_size)
        return filename

    def _retrieve(self, file, buffer: bytes, payload_size: int):
        """Write `buffer` to `file`, then keep receiving until `payload_size`
        bytes were written or the client returns no more data.
        """
        # NOTE(review): with payload_size == -1 (no content-length) the loop
        # below never runs, so only the bytes already in `buffer` are saved --
        # confirm whether reading until the connection closes was intended.
        file.write(buffer)
        cur_payload_size = len(buffer)
        while cur_payload_size < payload_size:
            buffer = self.client.receive()
            logging.debug("Received payload length: %s", len(buffer))

            if not buffer:
                # The peer stopped sending before the announced size was reached.
                logging.warning("Received payload length %s less than expected %s", cur_payload_size, payload_size)
                break

            cur_payload_size += len(buffer)
            logging.debug("Processed payload: %r", cur_payload_size)
            file.write(buffer)

    def __get_content_length(self):
        """Return the content-length header as an int, or `None` if absent."""
        content_length = self.headers.get("content-length")
        if not content_length:
            return None
        return int(content_length)
class HTMLResponseHandler(PlainResponseHandler):
    """Response handler for HTML pages.

    The page is first saved to a temporary file, then every `<img>` is
    downloaded and its `src` rewritten to the local filename, and the result
    is written to the final file.
    """

    def __init__(self, client: HTTPClient, headers, status_code, url):
        super().__init__(client, headers, status_code, url)

    def handle(self, buffer: bytes):
        """Download the page and its images.

        Returns the filename of the saved page, or `None` when the response
        announces an empty payload.
        """
        payload_size = self._get_payload_size()
        if payload_size is None:
            return None

        logging.debug("Retrieving payload")
        filename = self.get_filename()
        tmp_filename = "." + filename + ".tmp"
        try:
            with open(tmp_filename, "wb") as file:
                self._retrieve(file, buffer, payload_size)
            self.__download_images(tmp_filename, filename)
        finally:
            # Remove the temporary file even when a step above failed.
            if os.path.exists(tmp_filename):
                os.remove(tmp_filename)
        return filename

    def __download_images(self, tmp_filename, target_filename):
        """Download all images referenced by the page in `tmp_filename` and
        write the page with rewritten `src` attributes to `target_filename`.
        """
        (host, path) = parse_uri(self.url)
        with open(tmp_filename, "r") as fp:
            soup = BeautifulSoup(fp, "lxml")

        for tag in soup.find_all("img"):
            try:
                local_name = self.__download_image(tag["src"], host, path)
            except Exception as e:
                # Deliberately broad: one failing image must not abort the page.
                logging.error("Failed to download image, skipping...", exc_info=e)
                continue
            if local_name:
                tag["src"] = local_name
            else:
                # The download was aborted; keep the original src rather than
                # replacing it with None.
                logging.error("Failed to download image, skipping...")

        with open(target_filename, 'w') as file:
            file.write(str(soup))

    def __download_image(self, img_src, host, path):
        """Request the image at `img_src` and save it locally.

        `host` and `path` are the host and path of the page that references
        the image; they are used to resolve relative sources. Returns the
        local filename, or `None` when the download was aborted.
        Raises `ValueError` when `img_src` has no path on the page's host.
        """
        parsed = urlparse(img_src)
        same_host = len(parsed.netloc) == 0 or parsed.netloc == host

        if same_host:
            img_host = host
            if not parsed.path:
                raise ValueError("Image source has no path: %r" % (img_src,))
            if parsed.path[0] != "/":
                # Relative source: resolve against the directory of the page.
                base = os.path.split(path)[0]
                if not base.endswith("/"):
                    base += "/"
                img_path = base + parsed.path
            else:
                img_path = parsed.path
        else:
            (img_host, img_path) = parse_uri(img_src)

        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
        # The Host header must name the server the request is sent to, which
        # differs from the page host for images on another host.
        message += "Host: {host}\r\n\r\n".format(host=img_host)
        message = message.encode(FORMAT)

        if same_host:
            # Reuse the connection the page itself was fetched over.
            self.client.sendall(message)
            return self._handle_download(self.client, img_host + img_path)

        client = HTTPClient(img_src)
        try:
            client.connect((img_host, 80))
            client.sendall(message)
            return self._handle_download(client, img_host + img_path)
        finally:
            # Close the extra connection even when the download fails.
            client.close()
class ChunkedResponseHandler(ResponseHandler):
    """Response handler for responses using chunked transfer-encoding.

    Payload processing is not implemented: `handle` ignores the buffer and
    returns `None`.
    """

    def __init__(self, client: HTTPClient, headers, status_code, url):
        # Only this signature is kept: it is the one `construct` calls with.
        super().__init__(client, headers, status_code, url)

    def handle(self, buffer: bytes):
        """Ignore `buffer` and return `None`."""
        return None