This commit is contained in:
2021-03-24 16:35:12 +01:00
parent 9ba7a030a7
commit d14252f707
10 changed files with 325 additions and 185 deletions

View File

@@ -3,7 +3,7 @@ import argparse
import logging
import sys
from client.command import AbstractCommand
from client import command as cmd
def main():
@@ -18,7 +18,7 @@ def main():
logging.basicConfig(level=logging.ERROR - (10 * arguments.verbose))
logging.debug("Arguments: %s", arguments)
command = AbstractCommand.create(arguments.command, arguments.URI, arguments.port)
command = cmd.create(arguments.command, arguments.URI, arguments.port)
command.execute()

View File

@@ -1,16 +1,39 @@
import logging
from abc import ABC, abstractmethod
from typing import Dict, Tuple
from urllib.parse import urlparse
from client.response_handler import ResponseHandler
from client.httpclient import FORMAT, HTTPClient
from httplib import parser
from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding
from httplib.message import Message
from httplib.retriever import PreambleRetriever
sockets: Dict[str, HTTPClient] = {}
def create(command: str, url: str, port):
    """Factory returning the command object for an HTTP method name.

    :param command: HTTP method name: "GET", "HEAD", "POST" or "PUT".
    :param url: target URI, passed through to the command.
    :param port: TCP port, passed through to the command.
    :raises ValueError: if *command* is not a supported method.
    """
    if command == "GET":
        return GetCommand(url, port)
    elif command == "HEAD":
        return HeadCommand(url, port)
    elif command == "POST":
        return PostCommand(url, port)
    elif command == "PUT":
        return PutCommand(url, port)
    else:
        # Name the offending value so the CLI error is actionable
        # (the original raised a bare ValueError with no message).
        raise ValueError(f"Unsupported command: {command!r}")
class AbstractCommand(ABC):
uri: str
host: str
path: str
port: Tuple[str, int]
def __init__(self, url: str, port: str):
self.url = url
def __init__(self, uri: str, port):
self.uri = uri
self.host, _, self.path = parser.parse_uri(uri)
self.port = port
@property
@@ -18,20 +41,6 @@ class AbstractCommand(ABC):
def command(self):
pass
@staticmethod
def create(command: str, url: str, port: str):
if command == "GET":
return GetCommand(url, port)
elif command == "HEAD":
return HeadCommand(url, port)
elif command == "POST":
return PostCommand(url, port)
elif command == "PUT":
return PutCommand(url, port)
else:
raise ValueError()
@staticmethod
def build_message(command, host, path):
message = f"{command} {path} HTTP/1.1\r\n"
@@ -40,26 +49,34 @@ class AbstractCommand(ABC):
return message.encode(FORMAT)
def execute(self):
def execute(self, sub_request=False):
(host, path) = self.parse_uri()
client = HTTPClient(host)
client.conn.connect((host, int(self.port)))
client = sockets.get(host)
if client and client.is_closed():
sockets.pop(self.host)
client = None
if not client:
client = HTTPClient(host)
client.conn.connect((host, self.port))
sockets[host] = client
message = f"{self.command} {path} HTTP/1.1\r\n"
message += f"Host: {host}\r\n"
message += f"Host: {host}:{self.port}\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
encoded_msg = self._build_message(message)
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
logging.debug("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
logging.debug("Sending HTTP message: %r", encoded_msg)
client.conn.sendall(encoded_msg)
logging.info("HTTP request sent, awaiting response...")
try:
self._await_response(client)
retriever = PreambleRetriever(client)
self._await_response(client, retriever)
except InvalidResponse as e:
logging.debug("Internal error: Response could not be parsed", exc_info=e)
return
@@ -69,9 +86,10 @@ class AbstractCommand(ABC):
except UnsupportedEncoding as e:
logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
finally:
client.close()
if not sub_request:
client.close()
def _await_response(self, client):
def _await_response(self, client, retriever):
while True:
line = client.read_line()
print(line, end="")
@@ -82,11 +100,11 @@ class AbstractCommand(ABC):
return (message + "\r\n").encode(FORMAT)
def parse_uri(self):
parsed = urlparse(self.url)
parsed = urlparse(self.uri)
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + self.url)
parsed = urlparse("//" + self.uri)
host = parsed.netloc
path = parsed.path
@@ -105,6 +123,7 @@ class AbstractWithBodyCommand(AbstractCommand, ABC):
@staticmethod
def build_message(command, host, path):
message = AbstractCommand.build_message()
def _build_message(self, message: str) -> bytes:
body = input(f"Enter {self.command} data: ").encode(FORMAT)
print()
@@ -126,18 +145,31 @@ class HeadCommand(AbstractCommand):
class GetCommand(AbstractCommand):
def __init__(self, uri: str, port, dir=None):
super().__init__(uri, port)
self.dir = dir
self.filename = None
@property
def command(self):
return "GET"
def _await_response(self, client):
(version, status, msg) = parser.get_status_line(client)
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
headers = parser.get_headers(client)
logging.debug("Parsed headers: %r", headers)
def _get_preamble(self, retriever):
lines = retriever.retrieve()
(version, status, msg) = parser.parse_status_line(next(lines))
headers = parser.parse_headers(lines)
handler = ResponseHandler.create(client, headers, status, self.url)
handler.handle()
logging.debug("---response begin---\r\n%s--- response end---", "".join(retriever.buffer))
return Message(version, status, msg, headers)
def _await_response(self, client, retriever) -> str:
msg = self._get_preamble(retriever)
from client import response_handler
self.filename = response_handler.handle(client, msg, self, self.dir)
return
class PostCommand(AbstractWithBodyCommand):

6
client/htmlparser.py Normal file
View File

@@ -0,0 +1,6 @@
from bs4 import BeautifulSoup
class HTMLParser:
    """Wrapper around a parsed BeautifulSoup document.

    NOTE(review): currently a stub — the constructor discards *soup* and no
    methods are defined yet; confirm intended responsibilities before use.
    """

    def __init__(self, soup: BeautifulSoup):
        pass

View File

@@ -2,52 +2,57 @@ import logging
import os
import re
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse, unquote
from urllib.parse import urlsplit, unquote
import cssutils
from bs4 import BeautifulSoup, Tag
from client.command import AbstractCommand, GetCommand
from client.httpclient import HTTPClient, FORMAT
from httplib import parser
from httplib.exceptions import InvalidResponse
from httplib.message import Message
from httplib.retriever import Retriever
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
handler = BasicResponseHandler(client, msg, command)
retriever = handler.handle()
if retriever is None:
return
content_type = msg.headers.get("content-type")
if content_type and "text/html" in content_type:
handler = HTMLDownloadHandler(retriever, client, msg, command, dir)
else:
handler = RawDownloadHandler(retriever, client, msg, command, dir)
return handler.handle()
class ResponseHandler(ABC):
client: HTTPClient
headers: Dict[str, str]
status_code: int
url: str
retriever: Retriever
msg: Message
cmd: AbstractCommand
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd):
self.client = client
self.headers = headers
self.url = url
self.retriever = retriever
pass
self.msg = msg
self.cmd = cmd
@abstractmethod
def handle(self):
pass
@staticmethod
def create(client: HTTPClient, headers, status_code, url):
retriever = Retriever.create(client, headers)
content_type = headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLDownloadHandler(retriever, client, headers, url)
return RawDownloadHandler(retriever, client, headers, url)
@staticmethod
def parse_uri(uri: str):
parsed = urlparse(uri)
parsed = urlsplit(uri)
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + uri)
parsed = urlsplit("//" + uri)
host = parsed.netloc
path = parsed.path
@@ -56,11 +61,79 @@ class ResponseHandler(ABC):
return host, path
class DownloadHandler(ResponseHandler, ABC):
path: str
class BasicResponseHandler(ResponseHandler):
""" Response handler which throws away the body and only shows the headers.
In case of a redirect, it will process it and pass it to the appropriate response handler.
"""
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url)
def __init__(self, client: HTTPClient, msg: Message, cmd: AbstractCommand):
retriever = Retriever.create(client, msg.headers)
super().__init__(retriever, client, msg, cmd)
def handle(self):
return self._handle_status()
def _skip_body(self):
logging.debug("Skipping body: [")
for line in self.retriever.retrieve():
try:
logging.debug("%s", line.decode(FORMAT))
except Exception:
logging.debug("%r", line)
logging.debug("] done.")
def _handle_status(self):
logging.info("%d %s", self.msg.status, self.msg.msg)
if self.msg.status == 101:
# Switching protocols is not supported
print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
print(self.msg.headers)
return
if 200 <= self.msg.status < 300:
return self.retriever
if 300 <= self.msg.status < 400:
# Redirect
return self._do_handle_redirect()
if 400 <= self.msg.status < 500:
# Dump headers and exit with error
print(f"{self.msg.version} {self.msg.status} {self.msg.msg}")
print(self.msg.headers)
return None
def _do_handle_redirect(self):
self._skip_body()
location = self.msg.headers.get("location")
if not location:
raise InvalidResponse("No location in redirect")
parsed_location = urlsplit(location)
if not parsed_location.hostname:
raise InvalidResponse("Invalid location")
if not parsed_location.scheme == "http":
raise InvalidResponse("Only http is supported")
self.cmd.uri = location
self.cmd.host, self.cmd.port, self.cmd.path = parser.parse_uri(location)
if self.msg.status == 301:
logging.info("Status 301. Closing socket [%s]", self.cmd.host)
self.client.close()
self.cmd.execute()
return None
class DownloadHandler(ResponseHandler, ABC):
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
super().__init__(retriever, client, msg, cmd)
if not dir:
dir = self._create_directory()
@@ -68,11 +141,11 @@ class DownloadHandler(ResponseHandler, ABC):
self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
@staticmethod
def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
content_type = headers.get("content-type")
def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
content_type = msg.headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLDownloadHandler(retriever, client, headers, url, dir)
return RawDownloadHandler(retriever, client, headers, url, dir)
return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
return RawDownloadHandler(retriever, client, msg, cmd, dir)
def _create_directory(self):
path = self._get_duplicate_name(os.path.abspath(self.client.host))
@@ -91,54 +164,25 @@ class DownloadHandler(ResponseHandler, ABC):
def get_filename(self):
"""Returns the filename to download the payload to.
"""
filename = "index.html"
parsed = urlparse(self.url)
# If there is no netloc, the url is invalid, so prepend `//` and try again
if parsed.netloc == "":
parsed = urlparse("//" + self.url)
# If the path contains a `/` get only the last part and use it as filename
# If the path end with a `/`, it's a directory so ignore it.
if len(parsed.path) != 0:
index = parsed.path.rfind("/")
if index == -1:
filename = parsed.path
elif parsed.path[-1] != "/":
filename = parsed.path[index:]
filename = os.path.basename(self.cmd.path)
if filename == '':
return "index.html"
while "%" in filename:
filename = unquote(filename)
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
result = os.path.basename(filename).strip()
if any(letter.isalnum() for letter in result):
return result
return "index.html"
def _handle_sub_request(self, client, url):
(version, status, _) = parser.get_status_line(client)
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
headers = parser.get_headers(client)
logging.debug("Parsed headers: %r", headers)
if status != 200:
raise InvalidResponse("Status not expected 200: " + str(status))
retriever = Retriever.create(client, headers)
handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))
return handler.handle()
class RawDownloadHandler(DownloadHandler):
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url, dir)
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
super().__init__(retriever, client, msg, cmd, dir)
def handle(self) -> str:
logging.debug("Retrieving payload")
@@ -152,8 +196,8 @@ class RawDownloadHandler(DownloadHandler):
class HTMLDownloadHandler(DownloadHandler):
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url, dir)
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
super().__init__(retriever, client, msg, cmd, dir)
def handle(self) -> str:
@@ -172,11 +216,11 @@ class HTMLDownloadHandler(DownloadHandler):
def _download_images(self, tmp_filename, target_filename):
(host, path) = ResponseHandler.parse_uri(self.url)
(host, path) = ResponseHandler.parse_uri(self.cmd.uri)
with open(tmp_filename, "rb") as fp:
soup = BeautifulSoup(fp, 'lxml')
base_url = self.url
base_url = self.cmd.uri
base_element = soup.find("base")
if base_element:
@@ -186,58 +230,24 @@ class HTMLDownloadHandler(DownloadHandler):
tag: Tag
for tag in soup.find_all("img"):
try:
if tag.has_attr("src"):
el_name = "src"
elif tag.has_attr("data-src"):
el_name = "data-src"
else:
if not tag.has_attr("src"):
continue
if tag[el_name] in processed:
new_url = processed.get(tag[el_name])
if tag["src"] in processed:
new_url = processed.get(tag["src"])
else:
new_url = self.__download_image(tag[el_name], host, base_url)
processed[tag[el_name]] = new_url
new_url = self.__download_image(tag["src"], host, base_url)
processed[tag["src"]] = new_url
if new_url:
tag[el_name] = new_url
tag["src"] = new_url
except Exception as e:
logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
for tag in soup.find_all("div"):
if not tag.has_attr("style"):
continue
style = cssutils.parseStyle(tag["style"])
if "background" in style and "url(" in style["background"]:
el_name = "background"
elif "background-image" in style and "url(" in style["background-image"]:
el_name = "background-image"
else:
continue
el = style[el_name]
start = el.find("url(") + 4
end = el.find(")", start)
url = el[start:end].strip()
try:
if url in processed:
new_url = url
else:
new_url = self.__download_image(url, host, base_url)
processed[url] = new_url
if new_url:
el = el[:start] + new_url + el[end:]
style[el_name] = el
tag["style"] = style.cssText
except Exception as e:
logging.debug("Internal error", exc_info=e)
logging.error("Failed to download image: %s, skipping...", tag["src"])
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
with open(target_filename, 'w') as file:
file.write(str(soup))
def __download_image(self, img_src, host, base_url):
parsed = urlparse(img_src)
parsed = urlsplit(img_src)
logging.debug("Downloading image: %s", img_src)
@@ -245,36 +255,18 @@ class HTMLDownloadHandler(DownloadHandler):
# Not a valid url
return None
if parsed.hostname == host:
port = self.cmd.port
elif ":" in parsed.netloc:
port = parsed.netloc.split(":", 1)[1]
else:
port = 80
if len(parsed.netloc) == 0 and parsed.path != "/":
# relative url, append base_url
img_src = os.path.join(os.path.dirname(base_url), parsed.path)
parsed = urlparse(img_src)
command = GetCommand(img_src, port, os.path.dirname(self.path))
command.execute(True)
# Check if the image is located on the same server
if len(parsed.netloc) == 0 or parsed.netloc == host:
same_host = True
img_host = host
img_path = parsed.path
else:
same_host = False
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
message = f"GET {img_path} HTTP/1.1\r\n"
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
message += f"Host: {img_host}\r\n\r\n"
message = message.encode(FORMAT)
if same_host:
client = self.client
client.reset_request()
else:
client = HTTPClient(img_src)
client.conn.connect((img_host, 80))
client.conn.sendall(message)
filename = self._handle_sub_request(client, img_host + img_path)
if not same_host:
client.close()
return filename
return command.filename

View File

@@ -1,6 +1,7 @@
import logging
import socket
from io import BufferedReader
from typing import Tuple
BUFSIZE = 4096
TIMEOUT = 3
@@ -11,7 +12,7 @@ MAXLINE = 4096
class HTTPSocket:
host: str
conn: socket.socket
file: BufferedReader
file: Tuple[BufferedReader, None]
def __init__(self, conn: socket.socket, host: str):
@@ -24,8 +25,12 @@ class HTTPSocket:
def close(self):
self.file.close()
# self.conn.shutdown(socket.SHUT_RDWR)
self.conn.close()
def is_closed(self):
return self.file is None
def reset_request(self):
self.file.close()
self.file = self.conn.makefile("rb")

16
httplib/message.py Normal file
View File

@@ -0,0 +1,16 @@
from typing import Dict
class Message:
    """Container for a parsed HTTP response: status-line fields, headers
    and (optionally) the body.
    """

    version: str            # HTTP version token from the status-line
    status: int             # numeric status code (e.g. 200)
    msg: str                # reason phrase (e.g. "OK")
    headers: Dict[str, str] # header name -> value (lower-cased by the parser)
    body: bytes             # payload, or None when not (yet) retrieved

    def __init__(self, version: str, status: int, msg: str, headers: Dict[str, str], body: bytes = None):
        self.version = version
        self.status = status
        self.msg = msg
        self.headers = headers
        self.body = body

    def __repr__(self):
        # Added for debuggability: messages show up in logs and tracebacks.
        return (f"{type(self).__name__}(version={self.version!r}, "
                f"status={self.status!r}, msg={self.msg!r}, "
                f"headers={self.headers!r}, body={self.body!r})")

View File

@@ -1,6 +1,6 @@
import logging
import re
from urllib.parse import urlparse
from urllib.parse import urlparse, urlsplit
from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest
from httplib.httpsocket import HTTPSocket
@@ -42,6 +42,26 @@ def get_status_line(client: HTTPSocket):
return version, status, reason
def parse_status_line(line: str):
    """Parse an HTTP status-line into ``(version, status, reason)``.

    :param line: raw status-line, e.g. ``"HTTP/1.1 200 OK\\r\\n"``.
    :returns: tuple of version string (first 4 chars of the HTTP-version
        token), integer status code, and reason phrase.
    :raises InvalidStatusLine: if the line is malformed.
    """
    split = list(filter(None, line.strip().split(" ", 2)))
    if len(split) < 3:
        raise InvalidStatusLine(line)  # TODO fix exception
    (http_version, status, reason) = split
    if not _is_valid_http_version(http_version):
        raise InvalidStatusLine(line)
    version = http_version[:4]
    # fullmatch, not match: re.match only anchors at the start, so a status
    # token like "200;" passed the old check and made int() raise an
    # uncaught ValueError instead of InvalidStatusLine.
    if not re.fullmatch(r"\d{3}", status):
        raise InvalidStatusLine(line)
    status = int(status)
    # Defensive range check (redundant with the 3-digit match, kept cheap).
    if status < 100 or status > 999:
        raise InvalidStatusLine(line)
    return version, status, reason
def parse_request_line(client: HTTPSocket):
line, (method, target, version) = _get_start_line(client)
@@ -119,7 +139,7 @@ def parse_request_headers(client: HTTPSocket):
raise BadRequest()
headers[key] = value
return headers
@@ -157,6 +177,38 @@ def get_headers(client: HTTPSocket):
return result
def parse_headers(lines):
    """Consume header lines from *lines* (a generator of raw preamble
    lines) and return a dict mapping lower-cased header names to
    lower-cased values.

    NOTE(review): the first loop advances the generator twice per
    iteration (``for line in lines`` plus an explicit ``next(lines)``),
    so it discards every other line until a non-whitespace-led line is
    found — this looks unintentional; confirm against the retriever's
    line stream before relying on it.
    """
    headers = []
    # first header after the status-line may not contain a space
    for line in lines:
        line = next(lines)
        if line[0].isspace():
            continue
        else:
            break
    for line in lines:
        # A blank line (or a single space — presumably a lenient-server
        # workaround; TODO confirm) terminates the header section.
        if line in ("\r\n", "\n", " "):
            break
        if line[0].isspace():
            # Continuation line: drop the previous header's CRLF so the
            # folded parts join into one logical header line.
            headers[-1] = headers[-1].rstrip("\r\n")
        headers.append(line.lstrip())
    result = {}
    header_str = "".join(headers)
    for line in header_str.splitlines():
        pos = line.find(":")
        # Skip anything without a "name: value" shape (no colon, empty
        # name, or empty value).
        if pos <= 0 or pos >= len(line) - 1:
            continue
        (header, value) = map(str.strip, line.split(":", 1))
        check_next_header(result, header, value)
        result[header.lower()] = value.lower()
    return result
def check_next_header(headers, next_header: str, next_value: str):
if next_header == "content-length":
@@ -166,3 +218,25 @@ def check_next_header(headers, next_header: str, next_value: str):
if not next_value.isnumeric() or int(next_value) <= 0:
logging.error("Invalid content-length value: %r", next_value)
raise InvalidResponse()
def parse_uri(uri: str):
    """Split *uri* into ``(host, port, path)``.

    The path includes the query string when present. The port is the
    explicit port from the URI when given, otherwise 443 for https and
    80 for everything else.

    :param uri: absolute URI ("http://host[:port]/path") or a bare
        "host[:port][/path]" string.
    :returns: ``(host, port, path)`` with an ``int`` port.
    """
    parsed = urlsplit(uri)
    if parsed.hostname:
        host = parsed.hostname
        path = parsed.path
        if parsed.query != '':
            path = f"{path}?{parsed.query}"
        # BUG FIX: urlsplit strips the port out of .hostname, so the
        # ":" check below never fired for absolute URIs and an explicit
        # port (e.g. "http://host:8080/") was silently replaced by the
        # scheme default. Read it from .port instead.
        try:
            explicit_port = parsed.port
        except ValueError:
            # Non-numeric port in the netloc: fall through to defaults,
            # matching the old lenient behaviour.
            explicit_port = None
        if explicit_port is not None:
            return host, explicit_port, path
    else:
        # Not an absolute URI: everything before the first "/" is the
        # host. partition() (unlike split) also accepts a bare host with
        # no path at all.
        (host, _, path) = uri.partition("/")
    if ":" in host:
        host, _, port = host.partition(":")
        # int() so all branches agree on the port type; a str port
        # would crash later in socket.connect((host, port)).
        port = int(port)
    elif parsed.scheme == "https":
        port = 443
    else:
        port = 80
    return host, port, path

View File

@@ -42,6 +42,28 @@ class Retriever(ABC):
return ContentLengthRetriever(client, int(content_length))
class PreambleRetriever(Retriever):
    """Retriever that yields the raw preamble lines (status-line and
    headers) of a response while buffering every line it reads, so the
    complete preamble can be logged afterwards via ``buffer``.
    """

    client: HTTPSocket
    # Every raw line read so far, including the blank terminator line.
    buffer: list

    def __init__(self, client: HTTPSocket):
        super().__init__(client)
        self.client = client
        self.buffer = []

    def retrieve(self):
        """Yield preamble lines up to, but not including, the blank line.

        The terminating line is still appended to ``buffer`` before the
        generator stops. NOTE(review): a bare " " line also terminates —
        presumably a lenient-server workaround; confirm intent.
        """
        line = self.client.read_line()
        while True:
            self.buffer.append(line)
            if line in ("\r\n", "\n", " "):
                break
            yield line
            line = self.client.read_line()
class ContentLengthRetriever(Retriever):
length: int
@@ -63,21 +85,16 @@ class ContentLengthRetriever(Retriever):
buffer = self.client.read(remaining)
except TimeoutError:
logging.error("Timed out before receiving complete payload")
self.client.close()
raise IncompleteResponse("Timed out before receiving complete payload")
except ConnectionError:
logging.error("Timed out before receiving complete payload")
self.client.close()
raise IncompleteResponse("Connection closed before receiving complete payload")
logging.debug("Received payload length: %s", len(buffer))
if len(buffer) == 0:
logging.warning("Received payload length %s less than expected %s", cur_payload_size, self.length)
break
cur_payload_size += len(buffer)
logging.debug("Processed payload: %r", cur_payload_size)
yield buffer
return b""
@@ -108,7 +125,6 @@ class ChunkedRetriever(Retriever):
yield buffer
self.client.read_line() # remove CRLF
return b""
def __get_chunk_size(self):
line = self.client.read_line()

View File

@@ -1,5 +1,4 @@
import logging
import multiprocessing
import multiprocessing as mp
import threading
from concurrent.futures import ThreadPoolExecutor
@@ -69,7 +68,7 @@ class Worker:
handler = RequestHandler(conn, self.host)
handler.listen()
except Exception as e:
except Exception:
logging.debug("Internal error")
conn.shutdown(socket.SHUT_RDWR)