client: fixed GET
This commit is contained in:
@@ -1,46 +1,110 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Dict
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from client.Retriever import Retriever
|
||||
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
|
||||
|
||||
|
||||
def handle(client: HTTPClient, url: str):
|
||||
logging.debug("Waiting for response")
|
||||
try:
|
||||
buffer = client.receive()
|
||||
except TimeoutError:
|
||||
print("[ABRT] Response timed out")
|
||||
return
|
||||
|
||||
try:
|
||||
(header_chunk, buffer) = client.get_crlf_chunk(buffer)
|
||||
(status_line, headers) = client.parse_headers(header_chunk)
|
||||
client.validate_status_line(status_line)
|
||||
|
||||
status_code = int(status_line.split(" ")[1])
|
||||
|
||||
response_handler = construct(client, headers, status_code, url)
|
||||
response_handler.handle(buffer)
|
||||
(version, status, _) = get_status_line(client)
|
||||
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
|
||||
headers = get_headers(client)
|
||||
logging.debug("Parsed headers: %r", headers)
|
||||
|
||||
response_handler = construct(client, headers, status, url)
|
||||
response_handler.handle()
|
||||
|
||||
except InvalidResponse as e:
|
||||
logging.debug("Internal error: Response could not be parsed", exc_info=e)
|
||||
print("[ABRT] Invalid response")
|
||||
return
|
||||
except InvalidStatusLine as e:
|
||||
logging.debug("Internal error: Invalid status-line in response", exc_info=e)
|
||||
print("[ABRT] Invalid response")
|
||||
return
|
||||
except UnsupportedEncoding as e:
|
||||
logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
|
||||
print("[ABRT] Invalid response")
|
||||
return
|
||||
|
||||
|
||||
def get_status_line(client: HTTPClient):
    """Read the status-line of a response and validate it.

    One line is read with ``client.read_line()`` and split on spaces (runs of
    spaces are collapsed). The line must consist of an HTTP version, a
    three-digit status code and a reason field.

    :param client: client whose ``read_line()`` returns the next line as ``str``
    :return: tuple ``(version, status, reason)`` where ``version`` is ``"1.0"``
        or ``"1.1"``, ``status`` is an ``int`` and ``reason`` is the complete
        reason phrase without the line terminator
    :raises InvalidStatusLine: if the line has fewer than three fields, the
        version is not exactly ``HTTP/1.0`` or ``HTTP/1.1``, or the status is
        not exactly three ASCII digits with a value of at least 100
    """
    line = client.read_line()

    # Drop the empty strings produced by consecutive spaces.
    fields = [field for field in line.split(" ") if field]
    if len(fields) < 3:
        raise InvalidStatusLine(line)
    http_version, status_text, *reason_words = fields

    # fullmatch anchors both ends, so trailing junk such as "HTTP/1.1x" is
    # rejected; the class [01] no longer accepts a literal "|".
    version_match = re.fullmatch(r"HTTP/(1\.[01])", http_version)
    if version_match is None:
        raise InvalidStatusLine(line)
    version = version_match.group(1)

    # Exactly three ASCII digits: guarantees int() below cannot raise
    # ValueError, which callers do not catch.
    if re.fullmatch(r"[0-9]{3}", status_text) is None:
        raise InvalidStatusLine(line)
    status = int(status_text)
    if status < 100:
        raise InvalidStatusLine(line)

    # The reason phrase may contain spaces ("Not Found"); keep all of it.
    reason = " ".join(reason_words).rstrip("\r\n")
    return version, status, reason
|
||||
|
||||
|
||||
def get_headers(client: HTTPClient):
    """Read the header section of a response and return it as a dict.

    Lines are read with ``client.read_line()`` until a blank line (or an
    empty read) ends the header section. Lines starting with whitespace
    directly after the status-line are skipped; later lines starting with
    whitespace are treated as continuations of the previous header and are
    appended to it, separated by a single space.

    Lines without a colon, with an empty name, or with nothing after the
    colon are ignored. Every accepted header is passed to
    ``check_next_header`` before it is stored.

    :param client: client whose ``read_line()`` returns the next line as ``str``
    :return: dict mapping lower-cased header names to lower-cased values
    :raises InvalidResponse: propagated from ``check_next_header``
    """
    end_markers = ("\r\n", "\n", " ")

    def is_end(text):
        # An empty read is treated like the blank line ending the headers,
        # so indexing text[0] below can never fail.
        return not text or text in end_markers

    # The first header after the status-line may not start with whitespace;
    # such lines are skipped. The blank line must be tested first because
    # "\r\n" itself starts with a whitespace character.
    line = client.read_line()
    while not is_end(line) and line[0].isspace():
        line = client.read_line()

    raw_headers = []
    while not is_end(line):
        if line[0].isspace():
            # Continuation line: fold it into the previous header with one
            # separating space so the two values do not run together.
            raw_headers[-1] = raw_headers[-1] + " " + line.strip()
        else:
            raw_headers.append(line.rstrip("\r\n"))
        line = client.read_line()

    result = {}
    for entry in raw_headers:
        raw_name, colon, raw_value = entry.partition(":")
        # Skip entries with no colon, an empty name, or nothing after the colon.
        if not colon or not raw_name or not raw_value:
            continue

        name = raw_name.strip().lower()
        value = raw_value.strip()
        # Pass the lower-cased name: check_next_header compares against
        # "content-length" and looks it up in the lower-cased result dict.
        check_next_header(result, name, value)
        result[name] = value.lower()

    return result
|
||||
|
||||
|
||||
def check_next_header(headers, next_header: str, next_value: str):
    """Validate a header that is about to be added to ``headers``.

    Only ``content-length`` is checked; every other header is accepted.

    :param headers: headers parsed so far, keyed by lower-cased name
    :param next_header: name of the header to add (compared case-insensitively)
    :param next_value: value of the header to add
    :raises InvalidResponse: if a content-length header is already present,
        or if the value is not a positive decimal integer
    """
    # Header names are case-insensitive; "Content-Length" must be checked too.
    if next_header.lower() != "content-length":
        return

    if "content-length" in headers:
        logging.error("Multiple content-length headers specified")
        raise InvalidResponse()

    # str.isnumeric() accepts characters such as "²" that int() rejects with
    # ValueError, so require plain ASCII digits instead.
    # NOTE(review): a value of 0 is rejected here as before — confirm whether
    # the retriever is meant to support empty bodies.
    if re.fullmatch(r"[0-9]+", next_value) is None or int(next_value) <= 0:
        logging.error("Invalid content-length value: %r", next_value)
        raise InvalidResponse()
|
||||
|
||||
|
||||
def construct(client: HTTPClient, headers, status_code, url):
|
||||
# only chunked transfer-encoding is supported
|
||||
transfer_encoding = headers.get("transfer-encoding")
|
||||
@@ -53,13 +117,12 @@ def construct(client: HTTPClient, headers, status_code, url):
|
||||
if content_encoding:
|
||||
raise UnsupportedEncoding("content-encoding", content_encoding)
|
||||
|
||||
if chunked:
|
||||
return ChunkedResponseHandler(client, headers, status_code, url)
|
||||
else:
|
||||
content_type = headers.get("content-type")
|
||||
if content_type and "text/html" in content_type:
|
||||
return HTMLResponseHandler(client, headers, status_code, url)
|
||||
return PlainResponseHandler(client, headers, status_code, url)
|
||||
retriever = Retriever.create(client, headers)
|
||||
|
||||
content_type = headers.get("content-type")
|
||||
if content_type and "text/html" in content_type:
|
||||
return HTMLDownloadHandler(retriever, client, headers, url)
|
||||
return RawDownloadHandler(retriever, client, headers, url)
|
||||
|
||||
|
||||
def parse_uri(uri: str):
|
||||
@@ -81,17 +144,54 @@ class ResponseHandler:
|
||||
headers: Dict[str, str]
|
||||
status_code: int
|
||||
url: str
|
||||
retriever: Retriever
|
||||
|
||||
def __init__(self, client: HTTPClient, headers: Dict[str, str], status_code: int, url: str):
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
|
||||
self.client = client
|
||||
self.headers = headers
|
||||
self.status_code = status_code
|
||||
self.url = url
|
||||
self.retriever = retriever
|
||||
pass
|
||||
|
||||
def handle(self, buffer: bytes):
|
||||
def handle(self):
|
||||
pass
|
||||
|
||||
|
||||
class DownloadHandler(ResponseHandler):
|
||||
path: str
|
||||
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
    """Initialise the handler and decide where the payload is written.

    :param retriever: passed on to the parent constructor
    :param client: passed on to the parent constructor
    :param headers: passed on to the parent constructor
    :param url: passed on to the parent constructor
    :param dir: directory to download into; when falsy a new directory is
        created with ``_create_directory()``
    """
    super().__init__(retriever, client, headers, url)

    # A falsy ``dir`` (None or "") means no target directory was supplied.
    target_dir = dir or self._create_directory()
    filename = self.get_filename()

    # Never overwrite an existing file: pick a free name next to it.
    self.path = self._get_duplicate_name(os.path.join(target_dir, filename))
|
||||
|
||||
@staticmethod
def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
    """Build the download handler that matches the response's content type.

    :return: an ``HTMLDownloadHandler`` when the ``content-type`` header
        contains ``text/html``, otherwise a ``RawDownloadHandler``
    """
    content_type = headers.get("content-type")
    is_html = bool(content_type) and "text/html" in content_type
    handler_cls = HTMLDownloadHandler if is_html else RawDownloadHandler
    return handler_cls(retriever, client, headers, url, dir)
|
||||
|
||||
def handle(self) -> str:
    """Placeholder for the download step.

    This base implementation does nothing and returns ``None``.
    """
    return None
|
||||
|
||||
def _create_directory(self):
|
||||
path = self._get_duplicate_name(os.path.abspath(self.client.host))
|
||||
os.mkdir(path)
|
||||
return path
|
||||
|
||||
def _get_duplicate_name(self, path):
|
||||
tmp_path = path
|
||||
i = 0
|
||||
while os.path.exists(tmp_path):
|
||||
i += 1
|
||||
tmp_path = "{path}.{counter}".format(path=path, counter=i)
|
||||
|
||||
return tmp_path
|
||||
|
||||
def get_filename(self):
|
||||
"""Returns the filename to download the payload to.
|
||||
"""
|
||||
@@ -118,131 +218,68 @@ class ResponseHandler:
|
||||
|
||||
return "index.html"
|
||||
|
||||
def _handle_download(self, client, url):
|
||||
logging.debug("Waiting for response")
|
||||
try:
|
||||
buffer = client.receive()
|
||||
except TimeoutError:
|
||||
print("[ABRT] Response timed out")
|
||||
return
|
||||
def _handle_sub_request(self, client, url):
|
||||
|
||||
try:
|
||||
(header_chunk, buffer) = client.get_crlf_chunk(buffer)
|
||||
(status_line, headers) = client.parse_headers(header_chunk)
|
||||
client.validate_status_line(status_line)
|
||||
(version, status, _) = get_status_line(client)
|
||||
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
|
||||
headers = get_headers(client)
|
||||
logging.debug("Parsed headers: %r", headers)
|
||||
|
||||
status_code = int(status_line.split(" ")[1])
|
||||
if status_code != 200:
|
||||
raise InvalidResponse("Code not 200")
|
||||
if status != 200:
|
||||
raise InvalidResponse("Status not expected 200: " + str(status))
|
||||
|
||||
response_handler = construct(client, headers, status_code, url)
|
||||
filename = response_handler.handle(buffer)
|
||||
retriever = Retriever.create(client, headers)
|
||||
handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))
|
||||
|
||||
return filename
|
||||
return handler.handle()
|
||||
|
||||
|
||||
except InvalidResponse as e:
|
||||
logging.debug("Internal error: Response could not be parsed", exc_info=e)
|
||||
print("[ABRT] Invalid response")
|
||||
return
|
||||
except InvalidStatusLine as e:
|
||||
logging.debug("Internal error: Invalid status-line in response", exc_info=e)
|
||||
print("[ABRT] Invalid response")
|
||||
return
|
||||
except UnsupportedEncoding as e:
|
||||
logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
|
||||
print("[ABRT] Invalid response")
|
||||
return
|
||||
class RawDownloadHandler(DownloadHandler):
|
||||
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
    """Forward all arguments unchanged to the parent constructor."""
    super().__init__(retriever, client, headers, url, dir=dir)
|
||||
|
||||
class PlainResponseHandler(ResponseHandler):
|
||||
def __init__(self, client: HTTPClient, headers, status_code, url):
|
||||
super().__init__(client, headers, status_code, url)
|
||||
|
||||
def _get_payload_size(self):
|
||||
content_length = self.__get_content_length()
|
||||
if content_length == 0:
|
||||
logging.debug("content-length is 0")
|
||||
return None
|
||||
|
||||
payload_size = content_length
|
||||
if not content_length:
|
||||
payload_size = -1
|
||||
logging.debug("No content-length specified")
|
||||
else:
|
||||
logging.debug("Expected content-length=%s", payload_size)
|
||||
|
||||
return payload_size
|
||||
|
||||
def handle(self, buffer: bytes):
|
||||
payload_size = self._get_payload_size()
|
||||
if payload_size is None:
|
||||
return
|
||||
|
||||
def handle(self) -> str:
|
||||
logging.debug("Retrieving payload")
|
||||
filename = self.get_filename()
|
||||
file = open(filename, "wb")
|
||||
self._retrieve(file, buffer, payload_size)
|
||||
file.close()
|
||||
file = open(self.path, "wb")
|
||||
|
||||
return filename
|
||||
|
||||
def _retrieve(self, file, buffer: bytes, payload_size: int):
|
||||
|
||||
file.write(buffer)
|
||||
|
||||
cur_payload_size = len(buffer)
|
||||
while cur_payload_size < payload_size:
|
||||
buffer = self.client.receive()
|
||||
logging.debug("Received payload length: %s", len(buffer))
|
||||
|
||||
if len(buffer) == 0:
|
||||
logging.warning("Received payload length %s less than expected %s", cur_payload_size, payload_size)
|
||||
break
|
||||
|
||||
cur_payload_size += len(buffer)
|
||||
logging.debug("Processed payload: %r", cur_payload_size)
|
||||
for buffer in self.retriever.retrieve():
|
||||
file.write(buffer)
|
||||
|
||||
def __get_content_length(self):
|
||||
content_length = self.headers.get("content-length")
|
||||
if not content_length:
|
||||
return None
|
||||
|
||||
return int(content_length)
|
||||
|
||||
|
||||
class HTMLResponseHandler(PlainResponseHandler):
|
||||
def __init__(self, client: HTTPClient, headers, status_code, url):
|
||||
super().__init__(client, headers, status_code, url)
|
||||
|
||||
def handle(self, buffer: bytes):
|
||||
payload_size = self._get_payload_size()
|
||||
if payload_size is None:
|
||||
return
|
||||
|
||||
logging.debug("Retrieving payload")
|
||||
filename = self.get_filename()
|
||||
tmp_filename = "." + filename + ".tmp"
|
||||
file = open(tmp_filename, "wb")
|
||||
self._retrieve(file, buffer, payload_size)
|
||||
file.close()
|
||||
|
||||
self.__download_images(tmp_filename, filename)
|
||||
os.remove(tmp_filename)
|
||||
return filename
|
||||
return self.path
|
||||
|
||||
|
||||
class HTMLDownloadHandler(DownloadHandler):
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
    """Forward all arguments unchanged to the parent constructor."""
    super().__init__(retriever, client, headers, url, dir=dir)
|
||||
|
||||
def handle(self) -> str:
    """Download the HTML payload and post-process it for images.

    The payload yielded by ``self.retriever.retrieve()`` is first written to
    a hidden temporary file (``.<name>.tmp``) next to ``self.path``. That
    file is then handed to ``__download_images`` together with ``self.path``
    as the target, and removed afterwards.

    :return: ``self.path``
    """
    directory, filename = os.path.split(self.path)
    tmp_path = os.path.join(directory, ".{file}.tmp".format(file=filename))

    try:
        # The context manager closes the file even if retrieval fails midway.
        with open(tmp_path, "wb") as tmp_file:
            for buffer in self.retriever.retrieve():
                tmp_file.write(buffer)

        self.__download_images(tmp_path, self.path)
    finally:
        # Do not leave the temporary file behind when a step above raised;
        # it may not exist at all if open() itself failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return self.path
|
||||
|
||||
def __download_images(self, tmp_filename, target_filename):
|
||||
|
||||
(host, path) = parse_uri(self.url)
|
||||
with open(tmp_filename, "r") as fp:
|
||||
soup = BeautifulSoup(fp, "lxml")
|
||||
with open(tmp_filename, "rb") as fp:
|
||||
soup = BeautifulSoup(fp, 'html.parser')
|
||||
|
||||
for tag in soup.find_all("img"):
|
||||
try:
|
||||
tag["src"] = self.__download_image(tag["src"], host, path)
|
||||
except Exception as e:
|
||||
logging.error("Failed to download image, skipping...", exc_info=e)
|
||||
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
|
||||
|
||||
with open(target_filename, 'w') as file:
|
||||
file.write(str(soup))
|
||||
@@ -250,6 +287,8 @@ class HTMLResponseHandler(PlainResponseHandler):
|
||||
def __download_image(self, img_src, host, path):
|
||||
parsed = urlparse(img_src)
|
||||
|
||||
logging.debug("Downloading image: %s", img_src)
|
||||
|
||||
same_host = True
|
||||
if len(parsed.netloc) == 0 or parsed.netloc == host:
|
||||
img_host = host
|
||||
@@ -271,21 +310,14 @@ class HTMLResponseHandler(PlainResponseHandler):
|
||||
|
||||
if same_host:
|
||||
client = self.client
|
||||
client.reset_request()
|
||||
else:
|
||||
client = HTTPClient(img_src)
|
||||
client.connect((img_host, 80))
|
||||
client.sendall(message)
|
||||
filename = self._handle_download(client, img_host + img_path)
|
||||
filename = self._handle_sub_request(client, img_host + img_path)
|
||||
|
||||
if not same_host:
|
||||
client.close()
|
||||
|
||||
return filename
|
||||
|
||||
|
||||
class ChunkedResponseHandler(ResponseHandler):
|
||||
def __init__(self, client: HTTPClient, headers, status_code, url):
|
||||
super().__init__(client, headers, status_code, url)
|
||||
|
||||
def handle(self, buffer: bytes):
|
||||
return None
|
||||
|
Reference in New Issue
Block a user