Update
This commit is contained in:
@@ -2,52 +2,57 @@ import logging
|
||||
import os
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict
|
||||
from urllib.parse import urlparse, unquote
|
||||
from urllib.parse import urlsplit, unquote
|
||||
|
||||
import cssutils
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from client.command import AbstractCommand, GetCommand
|
||||
from client.httpclient import HTTPClient, FORMAT
|
||||
from httplib import parser
|
||||
from httplib.exceptions import InvalidResponse
|
||||
from httplib.message import Message
|
||||
from httplib.retriever import Retriever
|
||||
|
||||
|
||||
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
    """Process a parsed response and download its body when there is one.

    The status is handled first by ``BasicResponseHandler``. Only when that
    step yields a retriever is a download handler selected (HTML or raw,
    based on the ``content-type`` header) and run.

    :param client: client the response was received on
    :param msg: parsed response message; its ``headers`` are consulted
    :param command: command that issued the request
    :param dir: passed through unchanged to the download handler
    :return: the result of the download handler's ``handle()``, or ``None``
        when the status handler did not return a retriever
    """
    retriever = BasicResponseHandler(client, msg, command).handle()
    if retriever is None:
        return None

    # Media types are case-insensitive, so normalise before matching;
    # a missing header is treated as "not HTML".
    content_type = msg.headers.get("content-type") or ""
    if "text/html" in content_type.lower():
        downloader = HTMLDownloadHandler(retriever, client, msg, command, dir)
    else:
        downloader = RawDownloadHandler(retriever, client, msg, command, dir)

    return downloader.handle()
|
||||
|
||||
|
||||
class ResponseHandler(ABC):
|
||||
client: HTTPClient
|
||||
headers: Dict[str, str]
|
||||
status_code: int
|
||||
url: str
|
||||
retriever: Retriever
|
||||
msg: Message
|
||||
cmd: AbstractCommand
|
||||
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd):
|
||||
self.client = client
|
||||
self.headers = headers
|
||||
self.url = url
|
||||
self.retriever = retriever
|
||||
pass
|
||||
self.msg = msg
|
||||
self.cmd = cmd
|
||||
|
||||
@abstractmethod
|
||||
def handle(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def create(client: HTTPClient, headers, status_code, url):
|
||||
retriever = Retriever.create(client, headers)
|
||||
|
||||
content_type = headers.get("content-type")
|
||||
if content_type and "text/html" in content_type:
|
||||
return HTMLDownloadHandler(retriever, client, headers, url)
|
||||
return RawDownloadHandler(retriever, client, headers, url)
|
||||
|
||||
@staticmethod
|
||||
def parse_uri(uri: str):
|
||||
parsed = urlparse(uri)
|
||||
parsed = urlsplit(uri)
|
||||
|
||||
# If there is no netloc, the url is invalid, so prepend `//` and try again
|
||||
if parsed.netloc == "":
|
||||
parsed = urlparse("//" + uri)
|
||||
parsed = urlsplit("//" + uri)
|
||||
|
||||
host = parsed.netloc
|
||||
path = parsed.path
|
||||
@@ -56,11 +61,79 @@ class ResponseHandler(ABC):
|
||||
return host, path
|
||||
|
||||
|
||||
class DownloadHandler(ResponseHandler, ABC):
|
||||
path: str
|
||||
class BasicResponseHandler(ResponseHandler):
|
||||
""" Response handler which throws away the body and only shows the headers.
|
||||
In case of a redirect, it will process it and pass it to the appropriate response handler.
|
||||
"""
|
||||
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
|
||||
super().__init__(retriever, client, headers, url)
|
||||
def __init__(self, client: HTTPClient, msg: Message, cmd: AbstractCommand):
    """Create the handler, building the retriever from the client and the response headers."""
    super().__init__(Retriever.create(client, msg.headers), client, msg, cmd)
|
||||
|
||||
def handle(self):
    """Handle the response by dispatching on its status code.

    Returns whatever ``_handle_status()`` returns.
    """
    outcome = self._handle_status()
    return outcome
|
||||
|
||||
def _skip_body(self):
    """Iterate over the whole body via the retriever and discard it.

    Each retrieved piece is written to the debug log: decoded with ``FORMAT``
    when possible, otherwise as its ``repr``.
    """
    logging.debug("Skipping body: [")

    for chunk in self.retriever.retrieve():
        try:
            decoded = chunk.decode(FORMAT)
            logging.debug("%s", decoded)
        except Exception:
            # Fall back to the raw representation when the piece cannot be decoded
            logging.debug("%r", chunk)

    logging.debug("] done.")
|
||||
|
||||
def _handle_status(self):
    """Decide what to do with the response based on its status code.

    The status and reason are always logged at info level.

    :return: the retriever for a 2xx response; the result of
        ``_do_handle_redirect()`` for a 3xx response; ``None`` for every
        other status, after printing the status line and the headers
    """
    status = self.msg.status
    logging.info("%d %s", status, self.msg.msg)

    if 200 <= status < 300:
        return self.retriever

    if 300 <= status < 400:
        # Redirect
        return self._do_handle_redirect()

    # Everything else is reported rather than dropped silently: 101 (switching
    # protocols is not supported), other 1xx, 4xx, 5xx and out-of-range codes.
    print(f"{self.msg.version} {status} {self.msg.msg}")
    print(self.msg.headers)
    return None
|
||||
|
||||
def _do_handle_redirect(self):
    """Follow a redirect by re-running the command against the ``location`` target.

    The body of the redirect response is skipped first. The command's
    ``uri``, ``host``, ``port`` and ``path`` are then overwritten with the
    target and the command is executed again. On a 301 the client is closed
    before the command is re-executed.

    :raises InvalidResponse: when the ``location`` header is missing or empty,
        has no hostname, or uses a scheme other than ``http``
    :return: always ``None``
    """
    self._skip_body()

    target = self.msg.headers.get("location")
    if not target:
        raise InvalidResponse("No location in redirect")

    target_parts = urlsplit(target)
    if not target_parts.hostname:
        raise InvalidResponse("Invalid location")
    if target_parts.scheme != "http":
        raise InvalidResponse("Only http is supported")

    # Point the original command at the new location
    command = self.cmd
    command.uri = target
    command.host, command.port, command.path = parser.parse_uri(target)

    if self.msg.status == 301:
        logging.info("Status 301. Closing socket [%s]", command.host)
        self.client.close()

    command.execute()
    return None
|
||||
|
||||
|
||||
class DownloadHandler(ResponseHandler, ABC):
|
||||
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
|
||||
super().__init__(retriever, client, msg, cmd)
|
||||
|
||||
if not dir:
|
||||
dir = self._create_directory()
|
||||
@@ -68,11 +141,11 @@ class DownloadHandler(ResponseHandler, ABC):
|
||||
self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
|
||||
|
||||
@staticmethod
|
||||
def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
|
||||
content_type = headers.get("content-type")
|
||||
def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
|
||||
content_type = msg.headers.get("content-type")
|
||||
if content_type and "text/html" in content_type:
|
||||
return HTMLDownloadHandler(retriever, client, headers, url, dir)
|
||||
return RawDownloadHandler(retriever, client, headers, url, dir)
|
||||
return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
|
||||
return RawDownloadHandler(retriever, client, msg, cmd, dir)
|
||||
|
||||
def _create_directory(self):
|
||||
path = self._get_duplicate_name(os.path.abspath(self.client.host))
|
||||
@@ -91,54 +164,25 @@ class DownloadHandler(ResponseHandler, ABC):
|
||||
def get_filename(self):
|
||||
"""Returns the filename to download the payload to.
|
||||
"""
|
||||
filename = "index.html"
|
||||
|
||||
parsed = urlparse(self.url)
|
||||
|
||||
# If there is no netloc, the url is invalid, so prepend `//` and try again
|
||||
if parsed.netloc == "":
|
||||
parsed = urlparse("//" + self.url)
|
||||
|
||||
# If the path contains a `/` get only the last part and use it as filename
|
||||
# If the path ends with a `/`, it's a directory so ignore it.
|
||||
if len(parsed.path) != 0:
|
||||
index = parsed.path.rfind("/")
|
||||
if index == -1:
|
||||
filename = parsed.path
|
||||
elif parsed.path[-1] != "/":
|
||||
filename = parsed.path[index:]
|
||||
filename = os.path.basename(self.cmd.path)
|
||||
if filename == '':
|
||||
return "index.html"
|
||||
|
||||
while "%" in filename:
|
||||
filename = unquote(filename)
|
||||
|
||||
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
|
||||
|
||||
result = os.path.basename(filename).strip()
|
||||
if any(letter.isalnum() for letter in result):
|
||||
return result
|
||||
|
||||
return "index.html"
|
||||
|
||||
def _handle_sub_request(self, client, url):
|
||||
|
||||
(version, status, _) = parser.get_status_line(client)
|
||||
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
|
||||
headers = parser.get_headers(client)
|
||||
logging.debug("Parsed headers: %r", headers)
|
||||
|
||||
if status != 200:
|
||||
raise InvalidResponse("Status not expected 200: " + str(status))
|
||||
|
||||
retriever = Retriever.create(client, headers)
|
||||
handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))
|
||||
|
||||
return handler.handle()
|
||||
|
||||
|
||||
class RawDownloadHandler(DownloadHandler):
|
||||
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
|
||||
super().__init__(retriever, client, headers, url, dir)
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
    """Create the handler; all arguments are forwarded unchanged to the parent constructor."""
    super().__init__(retriever, client, msg, cmd, dir=dir)
|
||||
|
||||
def handle(self) -> str:
|
||||
logging.debug("Retrieving payload")
|
||||
@@ -152,8 +196,8 @@ class RawDownloadHandler(DownloadHandler):
|
||||
|
||||
|
||||
class HTMLDownloadHandler(DownloadHandler):
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
|
||||
super().__init__(retriever, client, headers, url, dir)
|
||||
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
    """Create the handler; all arguments are forwarded unchanged to the parent constructor."""
    super().__init__(retriever, client, msg, cmd, dir=dir)
|
||||
|
||||
def handle(self) -> str:
|
||||
|
||||
@@ -172,11 +216,11 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
|
||||
def _download_images(self, tmp_filename, target_filename):
|
||||
|
||||
(host, path) = ResponseHandler.parse_uri(self.url)
|
||||
(host, path) = ResponseHandler.parse_uri(self.cmd.uri)
|
||||
with open(tmp_filename, "rb") as fp:
|
||||
soup = BeautifulSoup(fp, 'lxml')
|
||||
|
||||
base_url = self.url
|
||||
base_url = self.cmd.uri
|
||||
base_element = soup.find("base")
|
||||
|
||||
if base_element:
|
||||
@@ -186,58 +230,24 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
tag: Tag
|
||||
for tag in soup.find_all("img"):
|
||||
try:
|
||||
if tag.has_attr("src"):
|
||||
el_name = "src"
|
||||
elif tag.has_attr("data-src"):
|
||||
el_name = "data-src"
|
||||
else:
|
||||
if not tag.has_attr("src"):
|
||||
continue
|
||||
|
||||
if tag[el_name] in processed:
|
||||
new_url = processed.get(tag[el_name])
|
||||
if tag["src"] in processed:
|
||||
new_url = processed.get(tag["src"])
|
||||
else:
|
||||
new_url = self.__download_image(tag[el_name], host, base_url)
|
||||
processed[tag[el_name]] = new_url
|
||||
new_url = self.__download_image(tag["src"], host, base_url)
|
||||
processed[tag["src"]] = new_url
|
||||
if new_url:
|
||||
tag[el_name] = new_url
|
||||
tag["src"] = new_url
|
||||
except Exception as e:
|
||||
logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
|
||||
|
||||
for tag in soup.find_all("div"):
|
||||
if not tag.has_attr("style"):
|
||||
continue
|
||||
style = cssutils.parseStyle(tag["style"])
|
||||
|
||||
if "background" in style and "url(" in style["background"]:
|
||||
el_name = "background"
|
||||
elif "background-image" in style and "url(" in style["background-image"]:
|
||||
el_name = "background-image"
|
||||
else:
|
||||
continue
|
||||
el = style[el_name]
|
||||
start = el.find("url(") + 4
|
||||
end = el.find(")", start)
|
||||
url = el[start:end].strip()
|
||||
|
||||
try:
|
||||
if url in processed:
|
||||
new_url = url
|
||||
else:
|
||||
new_url = self.__download_image(url, host, base_url)
|
||||
processed[url] = new_url
|
||||
if new_url:
|
||||
el = el[:start] + new_url + el[end:]
|
||||
style[el_name] = el
|
||||
tag["style"] = style.cssText
|
||||
except Exception as e:
|
||||
logging.debug("Internal error", exc_info=e)
|
||||
logging.error("Failed to download image: %s, skipping...", tag["src"])
|
||||
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
|
||||
|
||||
with open(target_filename, 'w') as file:
|
||||
file.write(str(soup))
|
||||
|
||||
def __download_image(self, img_src, host, base_url):
|
||||
parsed = urlparse(img_src)
|
||||
parsed = urlsplit(img_src)
|
||||
|
||||
logging.debug("Downloading image: %s", img_src)
|
||||
|
||||
@@ -245,36 +255,18 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
# Not a valid url
|
||||
return None
|
||||
|
||||
if parsed.hostname == host:
|
||||
port = self.cmd.port
|
||||
elif ":" in parsed.netloc:
|
||||
port = parsed.netloc.split(":", 1)[1]
|
||||
else:
|
||||
port = 80
|
||||
|
||||
if len(parsed.netloc) == 0 and parsed.path != "/":
|
||||
# relative url, append base_url
|
||||
img_src = os.path.join(os.path.dirname(base_url), parsed.path)
|
||||
|
||||
parsed = urlparse(img_src)
|
||||
command = GetCommand(img_src, port, os.path.dirname(self.path))
|
||||
command.execute(True)
|
||||
|
||||
# Check if the image is located on the same server
|
||||
if len(parsed.netloc) == 0 or parsed.netloc == host:
|
||||
same_host = True
|
||||
img_host = host
|
||||
img_path = parsed.path
|
||||
else:
|
||||
same_host = False
|
||||
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
|
||||
|
||||
message = f"GET {img_path} HTTP/1.1\r\n"
|
||||
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
||||
message += f"Host: {img_host}\r\n\r\n"
|
||||
message = message.encode(FORMAT)
|
||||
|
||||
if same_host:
|
||||
client = self.client
|
||||
client.reset_request()
|
||||
else:
|
||||
client = HTTPClient(img_src)
|
||||
client.conn.connect((img_host, 80))
|
||||
client.conn.sendall(message)
|
||||
filename = self._handle_sub_request(client, img_host + img_path)
|
||||
|
||||
if not same_host:
|
||||
client.close()
|
||||
|
||||
return filename
|
||||
return command.filename
|
||||
|
Reference in New Issue
Block a user