This commit is contained in:
2021-03-24 16:35:12 +01:00
parent 9ba7a030a7
commit d14252f707
10 changed files with 325 additions and 185 deletions

View File

@@ -2,52 +2,57 @@ import logging
import os
import re
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse, unquote
from urllib.parse import urlsplit, unquote
import cssutils
from bs4 import BeautifulSoup, Tag
from client.command import AbstractCommand, GetCommand
from client.httpclient import HTTPClient, FORMAT
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.message import Message
from httplib.retriever import Retriever
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
    """Process one HTTP response and, on success, download its payload.

    First runs a BasicResponseHandler to deal with the status line
    (redirects, errors).  If that yields a body retriever, picks an
    HTML-aware or raw download handler based on the Content-Type header
    and returns that handler's result (the downloaded filename).
    """
    basic = BasicResponseHandler(client, msg, command)
    retriever = basic.handle()
    if retriever is None:
        # Status handling already consumed the response (error/redirect).
        return
    ctype = msg.headers.get("content-type")
    is_html = bool(ctype) and "text/html" in ctype
    downloader_cls = HTMLDownloadHandler if is_html else RawDownloadHandler
    downloader = downloader_cls(retriever, client, msg, command, dir)
    return downloader.handle()
class ResponseHandler(ABC):
client: HTTPClient
headers: Dict[str, str]
status_code: int
url: str
retriever: Retriever
msg: Message
cmd: AbstractCommand
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd):
self.client = client
self.headers = headers
self.url = url
self.retriever = retriever
pass
self.msg = msg
self.cmd = cmd
@abstractmethod
def handle(self):
pass
@staticmethod
def create(client: HTTPClient, headers, status_code, url):
retriever = Retriever.create(client, headers)
content_type = headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLDownloadHandler(retriever, client, headers, url)
return RawDownloadHandler(retriever, client, headers, url)
@staticmethod
def parse_uri(uri: str):
parsed = urlparse(uri)
parsed = urlsplit(uri)
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + uri)
parsed = urlsplit("//" + uri)
host = parsed.netloc
path = parsed.path
@@ -56,11 +61,79 @@ class ResponseHandler(ABC):
return host, path
class DownloadHandler(ResponseHandler, ABC):
path: str
class BasicResponseHandler(ResponseHandler):
""" Response handler which throws away the body and only shows the headers.
In case of a redirect, it will process it and pass it to the appropriate response handler.
"""
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url)
def __init__(self, client: HTTPClient, msg: Message, cmd: AbstractCommand):
retriever = Retriever.create(client, msg.headers)
super().__init__(retriever, client, msg, cmd)
def handle(self):
return self._handle_status()
def _skip_body(self):
logging.debug("Skipping body: [")
for line in self.retriever.retrieve():
try:
logging.debug("%s", line.decode(FORMAT))
except Exception:
logging.debug("%r", line)
logging.debug("] done.")
def _handle_status(self):
logging.info("%d %s", self.msg.status, self.msg.msg)
if self.msg.status == 101:
# Switching protocols is not supported
print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
print(self.msg.headers)
return
if 200 <= self.msg.status < 300:
return self.retriever
if 300 <= self.msg.status < 400:
# Redirect
return self._do_handle_redirect()
if 400 <= self.msg.status < 500:
# Dump headers and exit with error
print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
print(self.msg.headers)
return None
def _do_handle_redirect(self):
    """Follow a 3xx redirect by re-targeting the command and re-executing it.

    Raises:
        InvalidResponse: if the Location header is missing, has no
            hostname, or uses a scheme other than plain http.
    """
    # The redirect body is irrelevant, but it must be drained so the
    # connection can be reused for the follow-up request.
    self._skip_body()
    location = self.msg.headers.get("location")
    if not location:
        raise InvalidResponse("No location in redirect")
    parsed_location = urlsplit(location)
    if not parsed_location.hostname:
        raise InvalidResponse("Invalid location")
    if not parsed_location.scheme == "http":
        raise InvalidResponse("Only http is supported")
    # Re-point the originating command at the new location and rerun it.
    self.cmd.uri = location
    self.cmd.host, self.cmd.port, self.cmd.path = parser.parse_uri(location)
    if self.msg.status == 301:
        # NOTE(review): on a permanent redirect the connection is dropped —
        # presumably because the target host may differ; the re-executed
        # command reconnects. TODO confirm against cmd.execute().
        logging.info("Status 301. Closing socket [%s]", self.cmd.host)
        self.client.close()
    # NOTE(review): no redirect-count limit — a redirect loop would recurse
    # via cmd.execute() indefinitely.
    self.cmd.execute()
    return None
class DownloadHandler(ResponseHandler, ABC):
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
super().__init__(retriever, client, msg, cmd)
if not dir:
dir = self._create_directory()
@@ -68,11 +141,11 @@ class DownloadHandler(ResponseHandler, ABC):
self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
@staticmethod
def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
content_type = headers.get("content-type")
def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
content_type = msg.headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLDownloadHandler(retriever, client, headers, url, dir)
return RawDownloadHandler(retriever, client, headers, url, dir)
return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
return RawDownloadHandler(retriever, client, msg, cmd, dir)
def _create_directory(self):
path = self._get_duplicate_name(os.path.abspath(self.client.host))
@@ -91,54 +164,25 @@ class DownloadHandler(ResponseHandler, ABC):
def get_filename(self):
"""Returns the filename to download the payload to.
"""
filename = "index.html"
parsed = urlparse(self.url)
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + self.url)
# If the path contains a `/` get only the last part and use it as filename
# If the path end with a `/`, it's a directory so ignore it.
if len(parsed.path) != 0:
index = parsed.path.rfind("/")
if index == -1:
filename = parsed.path
elif parsed.path[-1] != "/":
filename = parsed.path[index:]
filename = os.path.basename(self.cmd.path)
if filename == '':
return "index.html"
while "%" in filename:
filename = unquote(filename)
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
result = os.path.basename(filename).strip()
if any(letter.isalnum() for letter in result):
return result
return "index.html"
def _handle_sub_request(self, client, url):
    """Read one HTTP response from *client* and save its payload.

    The file is written next to self.path via a RawDownloadHandler and
    the downloaded filename is returned.

    Raises:
        InvalidResponse: if the response status is anything but 200.
    """
    version, status, _ = parser.get_status_line(client)
    logging.debug("Parsed status-line: version: %s, status: %s", version, status)
    response_headers = parser.get_headers(client)
    logging.debug("Parsed headers: %r", response_headers)
    if status != 200:
        raise InvalidResponse("Status not expected 200: " + str(status))
    body_retriever = Retriever.create(client, response_headers)
    target_dir = os.path.dirname(self.path)
    sub_handler = RawDownloadHandler(body_retriever, client, response_headers, url, target_dir)
    return sub_handler.handle()
class RawDownloadHandler(DownloadHandler):
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url, dir)
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
super().__init__(retriever, client, msg, cmd, dir)
def handle(self) -> str:
logging.debug("Retrieving payload")
@@ -152,8 +196,8 @@ class RawDownloadHandler(DownloadHandler):
class HTMLDownloadHandler(DownloadHandler):
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url, dir)
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
super().__init__(retriever, client, msg, cmd, dir)
def handle(self) -> str:
@@ -172,11 +216,11 @@ class HTMLDownloadHandler(DownloadHandler):
def _download_images(self, tmp_filename, target_filename):
(host, path) = ResponseHandler.parse_uri(self.url)
(host, path) = ResponseHandler.parse_uri(self.cmd.uri)
with open(tmp_filename, "rb") as fp:
soup = BeautifulSoup(fp, 'lxml')
base_url = self.url
base_url = self.cmd.uri
base_element = soup.find("base")
if base_element:
@@ -186,58 +230,24 @@ class HTMLDownloadHandler(DownloadHandler):
tag: Tag
for tag in soup.find_all("img"):
try:
if tag.has_attr("src"):
el_name = "src"
elif tag.has_attr("data-src"):
el_name = "data-src"
else:
if not tag.has_attr("src"):
continue
if tag[el_name] in processed:
new_url = processed.get(tag[el_name])
if tag["src"] in processed:
new_url = processed.get(tag["src"])
else:
new_url = self.__download_image(tag[el_name], host, base_url)
processed[tag[el_name]] = new_url
new_url = self.__download_image(tag["src"], host, base_url)
processed[tag["src"]] = new_url
if new_url:
tag[el_name] = new_url
tag["src"] = new_url
except Exception as e:
logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
for tag in soup.find_all("div"):
if not tag.has_attr("style"):
continue
style = cssutils.parseStyle(tag["style"])
if "background" in style and "url(" in style["background"]:
el_name = "background"
elif "background-image" in style and "url(" in style["background-image"]:
el_name = "background-image"
else:
continue
el = style[el_name]
start = el.find("url(") + 4
end = el.find(")", start)
url = el[start:end].strip()
try:
if url in processed:
new_url = url
else:
new_url = self.__download_image(url, host, base_url)
processed[url] = new_url
if new_url:
el = el[:start] + new_url + el[end:]
style[el_name] = el
tag["style"] = style.cssText
except Exception as e:
logging.debug("Internal error", exc_info=e)
logging.error("Failed to download image: %s, skipping...", tag["src"])
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
with open(target_filename, 'w') as file:
file.write(str(soup))
def __download_image(self, img_src, host, base_url):
parsed = urlparse(img_src)
parsed = urlsplit(img_src)
logging.debug("Downloading image: %s", img_src)
@@ -245,36 +255,18 @@ class HTMLDownloadHandler(DownloadHandler):
# Not a valid url
return None
if parsed.hostname == host:
port = self.cmd.port
elif ":" in parsed.netloc:
port = parsed.netloc.split(":", 1)[1]
else:
port = 80
if len(parsed.netloc) == 0 and parsed.path != "/":
# relative url, append base_url
img_src = os.path.join(os.path.dirname(base_url), parsed.path)
parsed = urlparse(img_src)
command = GetCommand(img_src, port, os.path.dirname(self.path))
command.execute(True)
# Check if the image is located on the same server
if len(parsed.netloc) == 0 or parsed.netloc == host:
same_host = True
img_host = host
img_path = parsed.path
else:
same_host = False
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
message = f"GET {img_path} HTTP/1.1\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += f"Host: {img_host}\r\n\r\n"
message = message.encode(FORMAT)
if same_host:
client = self.client
client.reset_request()
else:
client = HTTPClient(img_src)
client.conn.connect((img_host, 80))
client.conn.sendall(message)
filename = self._handle_sub_request(client, img_host + img_path)
if not same_host:
client.close()
return filename
return command.filename