Files
CN2021/client/response_handler.py
2021-03-22 02:41:49 +01:00

269 lines
8.8 KiB
Python

import logging
import os
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse
import cssutils
from bs4 import BeautifulSoup, Tag
from client.httpclient import HTTPClient, FORMAT
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.retriever import Retriever
class ResponseHandler(ABC):
    """Base class for objects that consume an HTTP response body.

    A handler is built from an already-parsed status line and header block;
    concrete subclasses decide what to do with the payload (see
    ``RawDownloadHandler`` and ``HTMLDownloadHandler``).
    """

    client: HTTPClient       # connection the response body is read from
    headers: Dict[str, str]  # parsed response headers
    status_code: int
    url: str                 # url the response was requested for
    retriever: Retriever     # body retriever matching the transfer encoding

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
        self.client = client
        self.headers = headers
        self.url = url
        self.retriever = retriever

    @abstractmethod
    def handle(self):
        """Consume the response body; behaviour is subclass-specific."""

    @staticmethod
    def create(client: HTTPClient, headers, status_code, url):
        """Build the handler matching the response's ``content-type``.

        HTML responses get the image-rewriting handler; everything else is
        downloaded verbatim.
        """
        retriever = Retriever.create(client, headers)
        content_type = headers.get("content-type")
        if content_type and "text/html" in content_type:
            return HTMLDownloadHandler(retriever, client, headers, url)
        return RawDownloadHandler(retriever, client, headers, url)

    @staticmethod
    def parse_uri(uri: str):
        """Split *uri* into ``(host, path)``; the path always starts with '/'."""
        parsed = urlparse(uri)
        # Without a scheme, urlparse puts the host into `path`; prepending
        # `//` forces it to be parsed as the network location instead.
        if parsed.netloc == "":
            parsed = urlparse("//" + uri)
        host = parsed.netloc
        path = parsed.path
        # Covers both the empty path and a path missing its leading slash.
        if not path.startswith("/"):
            path = "/" + path
        return host, path
class DownloadHandler(ResponseHandler, ABC):
    """ResponseHandler that stores the payload in a file on disk."""

    path: str  # target file the payload will be written to

    def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        """
        :param dir: target directory; when falsy, a fresh directory named
            after the client's host is created in the working directory.
        """
        super().__init__(retriever, client, headers, url)
        if not dir:
            dir = self._create_directory()
        self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))

    @staticmethod
    def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
        """Build the download handler matching the response's ``content-type``."""
        content_type = headers.get("content-type")
        if content_type and "text/html" in content_type:
            return HTMLDownloadHandler(retriever, client, headers, url, dir)
        return RawDownloadHandler(retriever, client, headers, url, dir)

    def _create_directory(self):
        """Create and return a fresh directory named after the host."""
        # NOTE(review): exists-check followed by mkdir is racy if several
        # processes share the working directory; fine for a single client.
        path = self._get_duplicate_name(os.path.abspath(self.client.host))
        os.mkdir(path)
        return path

    @staticmethod
    def _get_duplicate_name(path):
        """Return *path* itself, or ``path.N`` for the first unused N."""
        tmp_path = path
        i = 0
        while os.path.exists(tmp_path):
            i += 1
            tmp_path = "{path}.{counter}".format(path=path, counter=i)
        return tmp_path

    def get_filename(self):
        """Returns the filename to download the payload to.

        Derived from the last path segment of ``self.url``; falls back to
        ``index.html`` for directory urls or names with no alphanumerics.
        """
        filename = "index.html"
        parsed = urlparse(self.url)
        # If there is no netloc, the url is invalid, so prepend `//` and try again
        if parsed.netloc == "":
            parsed = urlparse("//" + self.url)
        # If the path contains a `/` get only the last part and use it as filename
        # If the path ends with a `/`, it's a directory so ignore it.
        if len(parsed.path) != 0:
            index = parsed.path.rfind("/")
            if index == -1:
                filename = parsed.path
            elif parsed.path[-1] != "/":
                filename = parsed.path[index:]
        result = os.path.basename(filename).strip()
        if any(letter.isalnum() for letter in result):
            return result
        return "index.html"

    def _handle_sub_request(self, client, url):
        """Read the response pending on *client* and store its body next to
        ``self.path``; returns the file it was written to.

        :raises InvalidResponse: when the status code is not 200.
        """
        (version, status, _) = parser.get_status_line(client)
        logging.debug("Parsed status-line: version: %s, status: %s", version, status)
        headers = parser.get_headers(client)
        logging.debug("Parsed headers: %r", headers)
        if status != 200:
            raise InvalidResponse("Status not expected 200: " + str(status))
        retriever = Retriever.create(client, headers)
        handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))
        return handler.handle()
class RawDownloadHandler(DownloadHandler):
    """Downloads the response payload verbatim to ``self.path``."""

    def handle(self) -> str:
        """Stream the body to disk chunk by chunk; returns the target path."""
        logging.debug("Retrieving payload")
        # `with` guarantees the file is closed even if retrieval fails
        # mid-stream (the original leaked the handle on exception).
        with open(self.path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)
        return self.path
class HTMLDownloadHandler(DownloadHandler):
    """Downloads an HTML page together with the images it references.

    The raw page is first streamed to a hidden temp file, then re-parsed so
    that ``<img src=...>`` attributes and inline ``background(-image)`` CSS
    urls can be rewritten to point at the locally downloaded copies.
    """

    def handle(self) -> str:
        """Download the page, fetch its images, and return the final path."""
        (dir, file) = os.path.split(self.path)
        tmp_filename = ".{file}.tmp".format(file=file)
        tmp_path = os.path.join(dir, tmp_filename)
        # `with` closes the handle even when retrieval raises mid-stream.
        with open(tmp_path, "wb") as file:
            for buffer in self.retriever.retrieve():
                file.write(buffer)
        self._download_images(tmp_path, self.path)
        os.remove(tmp_path)
        return self.path

    def _download_images(self, tmp_filename, target_filename):
        """Fetch referenced images and write the rewritten HTML to
        *target_filename*.
        """
        (host, path) = ResponseHandler.parse_uri(self.url)
        with open(tmp_filename, "rb") as fp:
            soup = BeautifulSoup(fp, 'lxml')
        # <base href> overrides the page url for resolving relative links.
        base_url = self.url
        base_element = soup.find("base")
        if base_element:
            base_url = base_element["href"]
        # Cache: remote url -> local filename (or None when skipped).
        processed = {}
        tag: Tag
        for tag in soup.find_all("img"):
            try:
                if tag["src"] in processed:
                    new_url = processed.get(tag["src"])
                else:
                    new_url = self.__download_image(tag["src"], host, base_url)
                    processed[tag["src"]] = new_url
                if new_url:
                    tag["src"] = new_url
            except Exception as e:
                logging.debug(e)
                # BUG FIX: use .get() so an <img> without `src` cannot raise
                # a second KeyError from inside the error handler.
                logging.error("Failed to download image: %s, skipping...", tag.get("src"))
        for tag in soup.find_all("div"):
            if not tag.has_attr("style"):
                continue
            style = cssutils.parseStyle(tag["style"])
            if "background" in style and "url(" in style["background"]:
                el_name = "background"
            elif "background-image" in style and "url(" in style["background-image"]:
                el_name = "background-image"
            else:
                continue
            el = style[el_name]
            start = el.find("url(") + 4
            end = el.find(")", start)
            url = el[start:end].strip()
            try:
                if url in processed:
                    # BUG FIX: reuse the cached *local* name; the original
                    # assigned the remote url back (`new_url = url`),
                    # inconsistent with the <img> loop above.
                    new_url = processed.get(url)
                else:
                    new_url = self.__download_image(url, host, base_url)
                    processed[url] = new_url
                if new_url:
                    el = el[:start] + new_url + el[end:]
                    style[el_name] = el
                    tag["style"] = style.cssText
            except Exception as e:
                logging.debug("Internal error", exc_info=e)
                # BUG FIX: <div> tags have no `src` attribute; logging
                # tag["src"] here raised KeyError. Log the css url instead.
                logging.error("Failed to download image: %s, skipping...", url)
        with open(target_filename, 'w') as file:
            file.write(str(soup))

    def __download_image(self, img_src, host, base_url):
        """Download one image; returns the local filename or None if the
        url cannot be handled.

        :param img_src: url exactly as found in the document
        :param host: host of the page being downloaded
        :param base_url: base to resolve relative urls against
        """
        parsed = urlparse(img_src)
        logging.debug("Downloading image: %s", img_src)
        if parsed.scheme not in ("", "http"):
            # Not a valid url for this plain-socket client (e.g. https, data:)
            return None
        if len(parsed.netloc) == 0 and parsed.path != "/":
            # relative url, resolve it against the directory of base_url
            img_src = os.path.join(os.path.dirname(base_url), parsed.path)
            parsed = urlparse(img_src)
        # Check if the image is located on the same server
        if len(parsed.netloc) == 0 or parsed.netloc == host:
            same_host = True
            img_host = host
            img_path = parsed.path
        else:
            same_host = False
            (img_host, img_path) = ResponseHandler.parse_uri(img_src)
        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
        # BUG FIX: the Host header must name the server the request goes to;
        # the original always sent the page's host, wrong for cross-host
        # images. Identical to before when same_host is true.
        message += "Host: {host}\r\n\r\n".format(host=img_host)
        message = message.encode(FORMAT)
        if same_host:
            # Reuse the page's connection for same-host images.
            client = self.client
            client.reset_request()
        else:
            client = HTTPClient(img_src)
            # NOTE(review): port 80 is hard-coded; urls with an explicit
            # port are not honoured — confirm whether that is acceptable.
            client.conn.connect((img_host, 80))
        client.conn.sendall(message)
        filename = self._handle_sub_request(client, img_host + img_path)
        if not same_host:
            client.close()
        return filename