# HTTP response parsing and payload download handlers.
import logging
|
|
import os
|
|
import re
|
|
from abc import ABC, abstractmethod
|
|
from typing import Dict
|
|
from urllib.parse import urlparse
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from client.Retriever import Retriever
|
|
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
|
|
|
|
|
|
class ResponseHandler(ABC):
    """Parses an HTTP response and dispatches it to a concrete download handler.

    Holds the client connection, the parsed response headers, the request URL
    and the payload :class:`Retriever`.  Subclasses implement :meth:`handle`
    to consume the payload.
    """

    client: HTTPClient
    headers: Dict[str, str]
    status_code: int
    url: str
    retriever: Retriever

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
        self.client = client
        self.headers = headers
        self.url = url
        self.retriever = retriever

    @abstractmethod
    def handle(self):
        """Consume the response payload; returns a handler-specific result."""

    @staticmethod
    def create(client: HTTPClient, headers, status_code, url):
        """Validate the response encodings and build the matching handler.

        ``status_code`` is kept for interface compatibility but is not used
        when selecting the handler.

        Raises UnsupportedEncoding for any transfer-encoding other than
        "chunked" and for any content-encoding at all.
        """
        # only chunked transfer-encoding is supported
        transfer_encoding = headers.get("transfer-encoding")
        if transfer_encoding and transfer_encoding != "chunked":
            raise UnsupportedEncoding("transfer-encoding", transfer_encoding)

        # content-encoding is not supported
        content_encoding = headers.get("content-encoding")
        if content_encoding:
            raise UnsupportedEncoding("content-encoding", content_encoding)

        retriever = Retriever.create(client, headers)

        # Delegate handler selection (HTML vs. raw) to DownloadHandler.create
        # so the content-type dispatch lives in exactly one place.
        return DownloadHandler.create(retriever, client, headers, url)

    @staticmethod
    def get_status_line(client: HTTPClient):
        """Read and parse the status line; return ``(version, status, reason)``.

        Raises InvalidStatusLine when the line is malformed.
        """
        line = client.read_line()

        split = list(filter(None, line.split(" ")))
        if len(split) < 3:
            raise InvalidStatusLine(line)

        # Check HTTP version, e.g. "HTTP/1.1"
        http_version = split.pop(0)
        if len(http_version) < 8 or http_version[4] != "/":
            raise InvalidStatusLine(line)

        (name, version) = http_version[:4], http_version[5:]
        # fullmatch with [01]: the original class [0|1] also accepted a
        # literal "|", and the unanchored match accepted e.g. "1.12".
        if name != "HTTP" or not re.fullmatch(r"1\.[01]", version):
            raise InvalidStatusLine(line)

        status = split.pop(0)
        # fullmatch rejects values such as "200abc", which previously passed
        # the unanchored regex and made int() raise an unhandled ValueError.
        if not re.fullmatch(r"\d{3}", status):
            raise InvalidStatusLine(line)
        status = int(status)
        # three digits guarantee status <= 999; still reject 0xx codes
        if status < 100:
            raise InvalidStatusLine(line)

        reason = split.pop(0)
        return version, status, reason

    @staticmethod
    def get_headers(client: HTTPClient):
        """Read the header section and return ``{lower-name: lower-value}``.

        Whitespace-continued (obs-fold) lines are merged into the preceding
        header.  Lines before the first real header that start with
        whitespace are skipped, as are lines without a ":" separator.
        """
        headers = []

        # first header after the status-line may not contain a space
        while True:
            line = client.read_line()
            if line[0].isspace():
                continue
            else:
                break

        while True:
            # a blank line terminates the header section
            if line in ("\r\n", "\n", " "):
                break

            if line[0].isspace():
                # obs-fold continuation: merge into the previous header.
                # NOTE(review): the fold is joined without a separating
                # space; RFC 7230 suggests replacing it with one — confirm
                # against the servers this client targets.
                headers[-1] = headers[-1].rstrip("\r\n")

            headers.append(line.lstrip())
            line = client.read_line()

        result = {}
        header_str = "".join(headers)
        for line in header_str.splitlines():
            pos = line.find(":")

            # skip lines with no name or no value
            if pos <= 0 or pos >= len(line) - 1:
                continue

            (header, value) = map(str.strip, line.split(":", 1))
            ResponseHandler.check_next_header(result, header, value)
            # NOTE: values are lower-cased too; case-sensitive values
            # (e.g. URLs) lose their original casing here.
            result[header.lower()] = value.lower()

        return result

    @staticmethod
    def check_next_header(headers, next_header: str, next_value: str):
        """Validate ``next_header`` against the headers parsed so far.

        Raises InvalidResponse on a duplicate or non-positive content-length.
        """
        if next_header == "content-length":
            if "content-length" in headers:
                logging.error("Multiple content-length headers specified")
                raise InvalidResponse()
            # NOTE(review): this also rejects "content-length: 0", which is
            # a valid empty body — confirm whether that is intentional.
            if not next_value.isnumeric() or int(next_value) <= 0:
                logging.error("Invalid content-length value: %r", next_value)
                raise InvalidResponse()

    @staticmethod
    def parse_uri(uri: str):
        """Split ``uri`` into ``(host, path)``; the path always starts with "/"."""
        parsed = urlparse(uri)

        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlparse("//" + uri)

        host = parsed.netloc
        path = parsed.path
        if len(path) == 0 or path[0] != '/':
            path = "/" + path
        return host, path
|
|
|
|
|
|
class DownloadHandler(ResponseHandler, ABC):
    """Base class for handlers that write the response payload to disk.

    On construction it picks (creating it when necessary) a target directory
    named after the remote host, and computes a collision-free file path for
    the download.
    """

    # filesystem path the payload will be written to
    path: str

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        super().__init__(retriever, client, headers, url)

        # no directory given: create a fresh one named after the host
        if not dir:
            dir = self._create_directory()

        # probe for a name that does not collide with an existing file
        self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))

    @staticmethod
    def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        """Return an HTMLDownloadHandler for text/html payloads, otherwise a RawDownloadHandler."""
        content_type = headers.get("content-type")
        if content_type and "text/html" in content_type:
            return HTMLDownloadHandler(retriever, client, headers, url, dir)
        return RawDownloadHandler(retriever, client, headers, url, dir)

    def _create_directory(self):
        """Create and return a new download directory named after the client's host."""
        # _get_duplicate_name avoids clobbering an existing file/directory
        path = self._get_duplicate_name(os.path.abspath(self.client.host))
        os.mkdir(path)
        return path

    def _get_duplicate_name(self, path):
        """Return ``path``, or ``path.N`` for the smallest N that does not exist yet."""
        tmp_path = path
        i = 0
        while os.path.exists(tmp_path):
            i += 1
            tmp_path = "{path}.{counter}".format(path=path, counter=i)

        return tmp_path

    def get_filename(self):
        """Returns the filename to download the payload to.

        Falls back to "index.html" when the URL path is empty, denotes a
        directory (ends with "/"), or yields a name containing no
        alphanumeric characters.
        """
        filename = "index.html"

        parsed = urlparse(self.url)

        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlparse("//" + self.url)

        # If the path contains a `/` get only the last part and use it as filename
        # If the path end with a `/`, it's a directory so ignore it.
        if len(parsed.path) != 0:
            index = parsed.path.rfind("/")
            if index == -1:
                filename = parsed.path
            elif parsed.path[-1] != "/":
                # slice keeps the leading "/"; basename below drops it
                filename = parsed.path[index:]

        result = os.path.basename(filename).strip()
        if any(letter.isalnum() for letter in result):
            return result

        return "index.html"

    def _handle_sub_request(self, client, url):
        """Parse the response on ``client`` and download its payload as a raw file.

        Returns the path the payload was written to.  Raises InvalidResponse
        for any status other than 200.
        """

        (version, status, _) = self.get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
        headers = self.get_headers(client)
        logging.debug("Parsed headers: %r", headers)

        if status != 200:
            raise InvalidResponse("Status not expected 200: " + str(status))

        retriever = Retriever.create(client, headers)
        # sub-request payloads are stored next to the main download
        handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))

        return handler.handle()
|
|
|
|
|
|
class RawDownloadHandler(DownloadHandler):
    """Downloads the payload verbatim to the path chosen by DownloadHandler."""

    # NOTE: the inherited DownloadHandler.__init__ is used unchanged; the
    # previous pass-through override added nothing.

    def handle(self) -> str:
        """Write the retrieved payload to ``self.path`` and return that path."""
        logging.debug("Retrieving payload")
        # `with` guarantees the handle is closed even if retrieve() raises;
        # the original leaked the file object on error.
        with open(self.path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)

        return self.path
|
|
|
|
|
|
class HTMLDownloadHandler(DownloadHandler):
    """Downloads an HTML payload, fetches the images it references, and
    rewrites their ``src`` attributes to point at the local copies."""

    # NOTE: the inherited DownloadHandler.__init__ is used unchanged; the
    # previous pass-through override added nothing.

    def handle(self) -> str:
        """Download the HTML, localize its images, and return the final path."""

        (dir, file) = os.path.split(self.path)
        # raw HTML goes to a hidden temp file first; the rewritten document
        # is then written to the final path
        tmp_filename = ".{file}.tmp".format(file=file)
        tmp_path = os.path.join(dir, tmp_filename)
        # `with` guarantees the handle is closed even if retrieve() raises
        with open(tmp_path, "wb") as out:
            for buffer in self.retriever.retrieve():
                out.write(buffer)

        self.__download_images(tmp_path, self.path)
        os.remove(tmp_path)
        return self.path

    def __download_images(self, tmp_filename, target_filename):
        """Parse the HTML in ``tmp_filename``, download every ``<img>``, rewrite
        the ``src`` attributes, and write the result to ``target_filename``.

        Individual image failures are logged and skipped so one broken image
        does not abort the page download.
        """

        (host, path) = ResponseHandler.parse_uri(self.url)
        with open(tmp_filename, "rb") as fp:
            soup = BeautifulSoup(fp, 'html.parser')

        base_url = self.url
        base_element = soup.find("base")

        # honour <base href=...>; a <base> without href is ignored (the
        # original raised an uncaught KeyError on it)
        if base_element and base_element.get("href"):
            base_url = base_element["href"]

        for tag in soup.find_all("img"):
            # an <img> without src previously raised KeyError inside the try
            # and then crashed again while logging it
            src = tag.get("src")
            if not src:
                continue
            try:
                tag["src"] = self.__download_image(src, host, base_url)
            except Exception as e:
                logging.debug(e)
                logging.error("Failed to download image: %s, skipping...", src)

        # write UTF-8 explicitly: the locale default encoding may not cover
        # every character BeautifulSoup decoded from the page
        with open(target_filename, 'w', encoding="utf-8") as file:
            file.write(str(soup))

    def __download_image(self, img_src, host, base_url):
        """Fetch one image and return the local filename it was saved to.

        Relative URLs are resolved against ``base_url``; images on other
        hosts are fetched over a dedicated connection (port 80 only — HTTPS
        is not supported by this client).
        """
        parsed = urlparse(img_src)

        logging.debug("Downloading image: %s", img_src)

        if len(parsed.netloc) == 0 and parsed.path != "/":
            # relative url, append base_url
            # NOTE(review): os.path.join uses the OS separator; on Windows
            # this can put backslashes into the URL — confirm target platforms.
            img_src = os.path.join(os.path.dirname(base_url), parsed.path)

            parsed = urlparse(img_src)

        # Check if the image is located on the same server
        if len(parsed.netloc) == 0 or parsed.netloc == host:
            same_host = True
            img_host = host
            img_path = parsed.path
        else:
            same_host = False
            (img_host, img_path) = ResponseHandler.parse_uri(img_src)

        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
        # BUG FIX: the Host header must name the server the image lives on;
        # the original always sent the page's host, breaking cross-host
        # requests (virtual hosting in particular).
        message += "Host: {host}\r\n\r\n".format(host=img_host)
        message = message.encode(FORMAT)

        if same_host:
            # reuse the page's connection
            client = self.client
            client.reset_request()
            client.sendall(message)
            return self._handle_sub_request(client, img_host + img_path)

        # dedicated connection for a foreign host; always close it, even when
        # the sub-request fails (the original leaked the socket on error)
        client = HTTPClient(img_src)
        try:
            client.connect((img_host, 80))
            client.sendall(message)
            return self._handle_sub_request(client, img_host + img_path)
        finally:
            client.close()
|