# CN2021/client/ResponseHandler.py
# Snapshot: 2021-03-19 03:29:35 +01:00 — 292 lines, 9.3 KiB, Python
import logging
import os
from typing import Dict
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
def handle(client: HTTPClient, url: str):
    """Receive one HTTP response on *client* and process it.

    Parses the status line and headers, then delegates the payload to the
    handler chosen by :func:`construct` for these headers. On timeout or a
    malformed/unsupported response, prints an ``[ABRT]`` message and returns.
    """
    logging.debug("Waiting for response")
    try:
        buffer = client.receive()
    except TimeoutError:
        print("[ABRT] Response timed out")
        return
    try:
        header_chunk, buffer = client.get_crlf_chunk(buffer)
        status_line, headers = client.parse_headers(header_chunk)
        client.validate_status_line(status_line)
        # Status line looks like "HTTP/1.1 200 OK" -> second token is the code.
        code = int(status_line.split(" ")[1])
        handler = construct(client, headers, code, url)
        handler.handle(buffer)
    except InvalidResponse as e:
        logging.debug("Internal error: Response could not be parsed", exc_info=e)
        print("[ABRT] Invalid response")
    except InvalidStatusLine as e:
        logging.debug("Internal error: Invalid status-line in response", exc_info=e)
        print("[ABRT] Invalid response")
    except UnsupportedEncoding as e:
        logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
        print("[ABRT] Invalid response")
def construct(client: HTTPClient, headers, status_code, url):
    """Select and build the ResponseHandler matching the response headers.

    Returns a ChunkedResponseHandler for chunked transfer-encoding, an
    HTMLResponseHandler for ``text/html`` bodies, and a PlainResponseHandler
    otherwise.

    Raises UnsupportedEncoding for any transfer-encoding other than
    ``chunked`` and for any content-encoding at all.
    """
    # Only chunked transfer-encoding is supported. Transfer-coding names are
    # case-insensitive (RFC 7230 §4), so normalize before comparing — the
    # original code rejected e.g. "Chunked" sent by a valid server.
    transfer_encoding = headers.get("transfer-encoding")
    chunked = False
    if transfer_encoding:
        if transfer_encoding.strip().lower() != "chunked":
            raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
        chunked = True
    # content-encoding (gzip, deflate, ...) is not supported at all.
    content_encoding = headers.get("content-encoding")
    if content_encoding:
        raise UnsupportedEncoding("content-encoding", content_encoding)
    if chunked:
        return ChunkedResponseHandler(client, headers, status_code, url)
    content_type = headers.get("content-type")
    if content_type and "text/html" in content_type:
        return HTMLResponseHandler(client, headers, status_code, url)
    return PlainResponseHandler(client, headers, status_code, url)
def parse_uri(uri: str):
    """Split *uri* into a ``(host, path)`` pair.

    A URI without a scheme (e.g. ``example.com/a``) leaves urlparse with an
    empty netloc; in that case re-parse with a ``//`` prefix so the leading
    segment is treated as the host. The returned path always starts with
    ``/`` (an empty path becomes ``/``).
    """
    parts = urlparse(uri)
    if not parts.netloc:
        # Whole string landed in .path; force urlparse to split out the host.
        parts = urlparse("//" + uri)
    path = parts.path if parts.path.startswith("/") else "/" + parts.path
    return parts.netloc, path
class ResponseHandler:
    """Base class for response handlers.

    Holds the open connection, the parsed response headers, the status code
    and the URL that was requested. Subclasses override :meth:`handle` to
    consume the payload.
    """

    # connection the response arrives on
    client: HTTPClient
    # parsed response headers (lowercase names -> values)
    headers: Dict[str, str]
    # numeric HTTP status code from the status line
    status_code: int
    # URL the request was made for; used to derive the download filename
    url: str

    def __init__(self, client: HTTPClient, headers: Dict[str, str], status_code: int, url: str):
        self.client = client
        self.headers = headers
        self.status_code = status_code
        self.url = url

    def handle(self, buffer: bytes):
        """Consume the payload starting with *buffer*. Overridden by subclasses."""
        pass

    def get_filename(self):
        """Return the local filename to download the payload to.

        Uses the last path segment of the URL; falls back to ``index.html``
        for directory URLs or names without any alphanumeric character.
        """
        parsed = urlparse(self.url)
        if parsed.netloc == "":
            # No netloc means urlparse put the host into .path; retry with a
            # `//` prefix so host and path separate correctly.
            parsed = urlparse("//" + self.url)
        candidate = "index.html"
        path = parsed.path
        if path:
            slash = path.rfind("/")
            if slash == -1:
                # Bare name with no directory component.
                candidate = path
            elif not path.endswith("/"):
                # Keep only the last segment; a trailing slash means a
                # directory, which keeps the default name.
                candidate = path[slash:]
        cleaned = os.path.basename(candidate).strip()
        if any(ch.isalnum() for ch in cleaned):
            return cleaned
        return "index.html"

    def _handle_download(self, client, url):
        """Receive a sub-resource response on *client* and save it.

        Like the module-level handle(), but additionally requires a 200
        status. Returns the saved filename, or None on timeout/error.
        """
        logging.debug("Waiting for response")
        try:
            buffer = client.receive()
        except TimeoutError:
            print("[ABRT] Response timed out")
            return
        try:
            header_chunk, buffer = client.get_crlf_chunk(buffer)
            status_line, headers = client.parse_headers(header_chunk)
            client.validate_status_line(status_line)
            code = int(status_line.split(" ")[1])
            if code != 200:
                raise InvalidResponse("Code not 200")
            handler = construct(client, headers, code, url)
            return handler.handle(buffer)
        except InvalidResponse as e:
            logging.debug("Internal error: Response could not be parsed", exc_info=e)
            print("[ABRT] Invalid response")
        except InvalidStatusLine as e:
            logging.debug("Internal error: Invalid status-line in response", exc_info=e)
            print("[ABRT] Invalid response")
        except UnsupportedEncoding as e:
            logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
            print("[ABRT] Invalid response")
class PlainResponseHandler(ResponseHandler):
    """Downloads a non-chunked response body to a local file, sized by the
    content-length header."""

    def __init__(self, client: HTTPClient, headers, status_code, url):
        super().__init__(client, headers, status_code, url)

    def _get_payload_size(self):
        """Return the expected payload size in bytes.

        Returns None for an explicit content-length of 0 (nothing to
        download) and -1 when the header is missing (size unknown).
        """
        content_length = self.__get_content_length()
        if content_length == 0:
            logging.debug("content-length is 0")
            return None
        if content_length is None:
            # NOTE(review): -1 makes _retrieve() stop after the initial
            # buffer, so responses without content-length are effectively
            # truncated to the first read — confirm this is intended.
            logging.debug("No content-length specified")
            return -1
        logging.debug("Expected content-length=%s", content_length)
        return content_length

    def handle(self, buffer: bytes):
        """Download the payload (starting with *buffer*) to a file.

        Returns the filename written, or None when there is no payload.
        """
        payload_size = self._get_payload_size()
        if payload_size is None:
            return
        logging.debug("Retrieving payload")
        filename = self.get_filename()
        # BUG FIX: use a context manager so the file is closed even if
        # _retrieve raises mid-download (the original leaked the handle).
        with open(filename, "wb") as file:
            self._retrieve(file, buffer, payload_size)
        return filename

    def _retrieve(self, file, buffer: bytes, payload_size: int):
        """Write *buffer*, then keep receiving until *payload_size* bytes
        have been written or the peer stops sending."""
        file.write(buffer)
        cur_payload_size = len(buffer)
        while cur_payload_size < payload_size:
            buffer = self.client.receive()
            logging.debug("Received payload length: %s", len(buffer))
            if len(buffer) == 0:
                # Peer closed early; keep the partial download.
                logging.warning("Received payload length %s less than expected %s", cur_payload_size, payload_size)
                break
            cur_payload_size += len(buffer)
            logging.debug("Processed payload: %r", cur_payload_size)
            file.write(buffer)

    def __get_content_length(self):
        """Parse the content-length header; None when absent or empty."""
        content_length = self.headers.get("content-length")
        if not content_length:
            return None
        return int(content_length)
class HTMLResponseHandler(PlainResponseHandler):
    """Downloads an HTML page, fetches the images it references, and
    rewrites their ``src`` attributes to point at the downloaded files."""

    def __init__(self, client: HTTPClient, headers, status_code, url):
        super().__init__(client, headers, status_code, url)

    def handle(self, buffer: bytes):
        """Download the page to a temp file, localize its images into the
        final file, and return the final filename (None if no payload)."""
        payload_size = self._get_payload_size()
        if payload_size is None:
            return
        logging.debug("Retrieving payload")
        filename = self.get_filename()
        tmp_filename = "." + filename + ".tmp"
        # BUG FIX: use a context manager so the temp file is closed even if
        # _retrieve raises mid-download (the original leaked the handle).
        with open(tmp_filename, "wb") as file:
            self._retrieve(file, buffer, payload_size)
        self.__download_images(tmp_filename, filename)
        os.remove(tmp_filename)
        return filename

    def __download_images(self, tmp_filename, target_filename):
        """Parse the HTML in *tmp_filename*, download every <img>, and write
        the rewritten document to *target_filename*."""
        (host, path) = parse_uri(self.url)
        with open(tmp_filename, "r") as fp:
            soup = BeautifulSoup(fp, "lxml")
            for tag in soup.find_all("img"):
                try:
                    tag["src"] = self.__download_image(tag["src"], host, path)
                except Exception as e:
                    # Best-effort: a broken image must not abort the page.
                    logging.error("Failed to download image, skipping...", exc_info=e)
        with open(target_filename, 'w') as file:
            file.write(str(soup))

    def __download_image(self, img_src, host, path):
        """Fetch one image and return the local filename it was saved to.

        Relative sources are resolved against the page's *host* and *path*;
        sources on another host get their own connection on port 80.
        """
        parsed = urlparse(img_src)
        same_host = True
        if len(parsed.netloc) == 0 or parsed.netloc == host:
            img_host = host
            # NOTE(review): parsed.path may be empty (e.g. src="#"), which
            # raises IndexError here — caught and skipped by the caller.
            if parsed.path[0] != "/":
                # Relative path: resolve against the page's directory.
                base = os.path.split(path)[0]
                if base[-1] != '/':
                    base += "/"
                img_path = base + parsed.path
            else:
                img_path = parsed.path
        else:
            same_host = False
            (img_host, img_path) = parse_uri(img_src)
        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
        # BUG FIX: the Host header must name the server the request is sent
        # to (RFC 7230 §5.4); the original always sent the page's host,
        # which broke downloads of images hosted elsewhere.
        message += "Host: {host}\r\n\r\n".format(host=img_host)
        message = message.encode(FORMAT)
        if same_host:
            client = self.client
        else:
            client = HTTPClient(img_src)
            client.connect((img_host, 80))
        client.sendall(message)
        filename = self._handle_download(client, img_host + img_path)
        if not same_host:
            client.close()
        return filename
class ChunkedResponseHandler(ResponseHandler):
    # Handler selected by construct() for transfer-encoding: chunked.
    # NOTE(review): chunked decoding is not implemented — handle() ignores
    # the buffer and returns None, so chunked responses download nothing.
    def __init__(self, client: HTTPClient, headers, status_code, url):
        super().__init__(client, headers, status_code, url)

    def handle(self, buffer: bytes):
        # Placeholder: no chunk parsing; nothing is written to disk.
        return None