client: fixed GET

This commit is contained in:
2021-03-20 21:45:28 +01:00
parent 797cdb0c0e
commit fa8d08d63d
5 changed files with 341 additions and 142 deletions

View File

@@ -196,6 +196,11 @@ def parse_uri(uri: str):
path = parsed.path
if len(path) == 0 or path[0] != '/':
path = "/" + path
port_pos = host.find(":")
if port_pos >= 0:
host = host[:port_pos]
return host, path
@@ -213,7 +218,7 @@ def main():
(host, path) = parse_uri(arguments.URI)
client = HTTPClient(host)
client.connect((host, arguments.port))
client.connect((host, int(arguments.port)))
message = "GET {path} HTTP/1.1\r\n".format(path=path)
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"

View File

@@ -1,46 +1,110 @@
import logging
import os
import re
from typing import Dict
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from client.Retriever import Retriever
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
def handle(client: HTTPClient, url: str):
logging.debug("Waiting for response")
try:
buffer = client.receive()
except TimeoutError:
print("[ABRT] Response timed out")
return
try:
(header_chunk, buffer) = client.get_crlf_chunk(buffer)
(status_line, headers) = client.parse_headers(header_chunk)
client.validate_status_line(status_line)
status_code = int(status_line.split(" ")[1])
response_handler = construct(client, headers, status_code, url)
response_handler.handle(buffer)
(version, status, _) = get_status_line(client)
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
headers = get_headers(client)
logging.debug("Parsed headers: %r", headers)
response_handler = construct(client, headers, status, url)
response_handler.handle()
except InvalidResponse as e:
logging.debug("Internal error: Response could not be parsed", exc_info=e)
print("[ABRT] Invalid response")
return
except InvalidStatusLine as e:
logging.debug("Internal error: Invalid status-line in response", exc_info=e)
print("[ABRT] Invalid response")
return
except UnsupportedEncoding as e:
logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
print("[ABRT] Invalid response")
return
def get_status_line(client: HTTPClient):
    """Read and validate the status-line of an HTTP response.

    One line is read with ``client.read_line()`` and split on spaces into
    the HTTP-version, the status code and the reason phrase.

    :param client: object providing ``read_line()``.
    :return: tuple ``(version, status, reason)``; ``version`` is the text
        after ``HTTP/`` (``"1.0"`` or ``"1.1"``), ``status`` is an int between
        100 and 999 and ``reason`` is the complete reason phrase without the
        line terminator (empty when the server sent none).
    :raises InvalidStatusLine: if the line is not a well-formed status-line.
    """
    line = client.read_line()
    # Drop the line terminator first so it cannot end up in the last field.
    fields = [field for field in line.rstrip("\r\n").split(" ") if field]
    if len(fields) < 2:
        raise InvalidStatusLine(line)

    http_version, status_text = fields[0], fields[1]

    # fullmatch: trailing garbage such as "HTTP/1.1x" must be rejected.
    if not re.fullmatch(r"HTTP/1\.[01]", http_version):
        raise InvalidStatusLine(line)
    version = http_version[5:]

    # Exactly three ASCII digits, so int() below cannot raise ValueError.
    if not re.fullmatch(r"[0-9]{3}", status_text):
        raise InvalidStatusLine(line)
    status = int(status_text)
    if status < 100 or status > 999:
        raise InvalidStatusLine(line)

    # The reason phrase may contain spaces ("Not Found"); keep all of it.
    reason = " ".join(fields[2:])
    return version, status, reason
def get_headers(client: HTTPClient):
    """Read the header section of a response and parse it into a dict.

    Lines are read with ``client.read_line()`` until the blank line that
    terminates the header section. Lines starting with whitespace are treated
    as continuations of the previous header and are joined to it with a
    single space. Lines without a colon, with an empty name, or ending in the
    colon are ignored.

    :param client: object providing ``read_line()``.
    :return: dict mapping lower-cased header names to lower-cased values; if
        a header occurs more than once the last occurrence wins.
    :raises InvalidResponse: if the stream ends before the terminating blank
        line, or if ``check_next_header`` rejects a header.
    """
    end_markers = ("\r\n", "\n", " ")
    logical_lines = []

    line = client.read_line()
    # The first header line may not start with whitespace; drop such lines,
    # but never read past the blank line that ends the header section.
    while line and line not in end_markers and line[0].isspace():
        line = client.read_line()

    while line not in end_markers:
        if not line:
            # read_line() returned nothing: the stream ended mid-headers.
            raise InvalidResponse("Connection closed before end of headers")
        text = line.rstrip("\r\n")
        if line[0].isspace():
            # Continuation line: fold it into the previous header.
            continuation = text.strip()
            if continuation:
                logical_lines[-1] += " " + continuation
        else:
            logical_lines.append(text)
        line = client.read_line()

    result = {}
    for entry in logical_lines:
        colon = entry.find(":")
        if colon <= 0 or colon >= len(entry) - 1:
            continue
        # Header names are case-insensitive: normalise before validating so
        # the duplicate/format checks see the same key that gets stored.
        name = entry[:colon].strip().lower()
        value = entry[colon + 1:].strip()
        check_next_header(result, name, value)
        result[name] = value.lower()

    return result
def check_next_header(headers, next_header: str, next_value: str):
    """Validate a header before it is added to the parsed *headers*.

    Only ``content-length`` is checked: it may occur once and its value must
    be a non-negative decimal number (``0`` is a valid length).

    :param headers: headers parsed so far, keyed by lower-cased name.
    :param next_header: name of the header about to be added (any case).
    :param next_value: value of the header about to be added.
    :raises InvalidResponse: on a duplicate or malformed content-length.
    """
    # Header names are case-insensitive.
    if next_header.lower() != "content-length":
        return

    if "content-length" in headers:
        logging.error("Multiple content-length headers specified")
        raise InvalidResponse()

    # ASCII digits only: str.isnumeric() also accepts characters such as
    # superscripts, for which int() raises ValueError later on.
    if not re.fullmatch(r"[0-9]+", next_value):
        logging.error("Invalid content-length value: %r", next_value)
        raise InvalidResponse()
def construct(client: HTTPClient, headers, status_code, url):
# only chunked transfer-encoding is supported
transfer_encoding = headers.get("transfer-encoding")
@@ -53,13 +117,12 @@ def construct(client: HTTPClient, headers, status_code, url):
if content_encoding:
raise UnsupportedEncoding("content-encoding", content_encoding)
if chunked:
return ChunkedResponseHandler(client, headers, status_code, url)
else:
retriever = Retriever.create(client, headers)
content_type = headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLResponseHandler(client, headers, status_code, url)
return PlainResponseHandler(client, headers, status_code, url)
return HTMLDownloadHandler(retriever, client, headers, url)
return RawDownloadHandler(retriever, client, headers, url)
def parse_uri(uri: str):
@@ -81,17 +144,54 @@ class ResponseHandler:
headers: Dict[str, str]
status_code: int
url: str
retriever: Retriever
def __init__(self, client: HTTPClient, headers: Dict[str, str], status_code: int, url: str):
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
self.client = client
self.headers = headers
self.status_code = status_code
self.url = url
self.retriever = retriever
pass
def handle(self, buffer: bytes):
def handle(self):
pass
class DownloadHandler(ResponseHandler):
path: str
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
    """Set up the handler and decide where the payload is stored.

    :param dir: directory to download into; when falsy a new directory is
        created with ``_create_directory()``.
    """
    super().__init__(retriever, client, headers, url)
    target_dir = dir if dir else self._create_directory()
    candidate = os.path.join(target_dir, self.get_filename())
    # Never overwrite an existing file: pick a free name instead.
    self.path = self._get_duplicate_name(candidate)
@staticmethod
def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
    """Return the download handler matching the response's content-type.

    An ``HTMLDownloadHandler`` is returned when the content-type header
    contains ``text/html``, a ``RawDownloadHandler`` otherwise.
    """
    media_type = headers.get("content-type")
    if not media_type or "text/html" not in media_type:
        return RawDownloadHandler(retriever, client, headers, url, dir)
    return HTMLDownloadHandler(retriever, client, headers, url, dir)
def handle(self) -> str:
    """Placeholder for the download step.

    This implementation does nothing and returns ``None``.
    """
    return None
def _create_directory(self):
    """Create a new directory named after the client's host.

    :return: absolute path of the created directory.
    """
    base = os.path.abspath(self.client.host)
    # Avoid clashing with a directory left behind by an earlier run.
    directory = self._get_duplicate_name(base)
    os.mkdir(directory)
    return directory
def _get_duplicate_name(self, path):
tmp_path = path
i = 0
while os.path.exists(tmp_path):
i += 1
tmp_path = "{path}.{counter}".format(path=path, counter=i)
return tmp_path
def get_filename(self):
"""Returns the filename to download the payload to.
"""
@@ -118,131 +218,68 @@ class ResponseHandler:
return "index.html"
def _handle_download(self, client, url):
logging.debug("Waiting for response")
try:
buffer = client.receive()
except TimeoutError:
print("[ABRT] Response timed out")
return
def _handle_sub_request(self, client, url):
try:
(header_chunk, buffer) = client.get_crlf_chunk(buffer)
(status_line, headers) = client.parse_headers(header_chunk)
client.validate_status_line(status_line)
(version, status, _) = get_status_line(client)
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
headers = get_headers(client)
logging.debug("Parsed headers: %r", headers)
status_code = int(status_line.split(" ")[1])
if status_code != 200:
raise InvalidResponse("Code not 200")
if status != 200:
raise InvalidResponse("Status not expected 200: " + str(status))
response_handler = construct(client, headers, status_code, url)
filename = response_handler.handle(buffer)
retriever = Retriever.create(client, headers)
handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))
return filename
return handler.handle()
except InvalidResponse as e:
logging.debug("Internal error: Response could not be parsed", exc_info=e)
print("[ABRT] Invalid response")
return
except InvalidStatusLine as e:
logging.debug("Internal error: Invalid status-line in response", exc_info=e)
print("[ABRT] Invalid response")
return
except UnsupportedEncoding as e:
logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
print("[ABRT] Invalid response")
return
class RawDownloadHandler(DownloadHandler):
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url, dir)
class PlainResponseHandler(ResponseHandler):
def __init__(self, client: HTTPClient, headers, status_code, url):
super().__init__(client, headers, status_code, url)
def _get_payload_size(self):
content_length = self.__get_content_length()
if content_length == 0:
logging.debug("content-length is 0")
return None
payload_size = content_length
if not content_length:
payload_size = -1
logging.debug("No content-length specified")
else:
logging.debug("Expected content-length=%s", payload_size)
return payload_size
def handle(self, buffer: bytes):
payload_size = self._get_payload_size()
if payload_size is None:
return
def handle(self) -> str:
logging.debug("Retrieving payload")
filename = self.get_filename()
file = open(filename, "wb")
self._retrieve(file, buffer, payload_size)
file = open(self.path, "wb")
for buffer in self.retriever.retrieve():
file.write(buffer)
file.close()
return filename
return self.path
def _retrieve(self, file, buffer: bytes, payload_size: int):
class HTMLDownloadHandler(DownloadHandler):
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
super().__init__(retriever, client, headers, url, dir)
def handle(self) -> str:
(dir, file) = os.path.split(self.path)
tmp_filename = ".{file}.tmp".format(file=file)
tmp_path = os.path.join(dir, tmp_filename)
file = open(tmp_path, "wb")
for buffer in self.retriever.retrieve():
file.write(buffer)
cur_payload_size = len(buffer)
while cur_payload_size < payload_size:
buffer = self.client.receive()
logging.debug("Received payload length: %s", len(buffer))
if len(buffer) == 0:
logging.warning("Received payload length %s less than expected %s", cur_payload_size, payload_size)
break
cur_payload_size += len(buffer)
logging.debug("Processed payload: %r", cur_payload_size)
file.write(buffer)
def __get_content_length(self):
content_length = self.headers.get("content-length")
if not content_length:
return None
return int(content_length)
class HTMLResponseHandler(PlainResponseHandler):
def __init__(self, client: HTTPClient, headers, status_code, url):
super().__init__(client, headers, status_code, url)
def handle(self, buffer: bytes):
payload_size = self._get_payload_size()
if payload_size is None:
return
logging.debug("Retrieving payload")
filename = self.get_filename()
tmp_filename = "." + filename + ".tmp"
file = open(tmp_filename, "wb")
self._retrieve(file, buffer, payload_size)
file.close()
self.__download_images(tmp_filename, filename)
os.remove(tmp_filename)
return filename
self.__download_images(tmp_path, self.path)
os.remove(tmp_path)
return self.path
def __download_images(self, tmp_filename, target_filename):
(host, path) = parse_uri(self.url)
with open(tmp_filename, "r") as fp:
soup = BeautifulSoup(fp, "lxml")
with open(tmp_filename, "rb") as fp:
soup = BeautifulSoup(fp, 'html.parser')
for tag in soup.find_all("img"):
try:
tag["src"] = self.__download_image(tag["src"], host, path)
except Exception as e:
logging.error("Failed to download image, skipping...", exc_info=e)
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
with open(target_filename, 'w') as file:
file.write(str(soup))
@@ -250,6 +287,8 @@ class HTMLResponseHandler(PlainResponseHandler):
def __download_image(self, img_src, host, path):
parsed = urlparse(img_src)
logging.debug("Downloading image: %s", img_src)
same_host = True
if len(parsed.netloc) == 0 or parsed.netloc == host:
img_host = host
@@ -271,21 +310,14 @@ class HTMLResponseHandler(PlainResponseHandler):
if same_host:
client = self.client
client.reset_request()
else:
client = HTTPClient(img_src)
client.connect((img_host, 80))
client.sendall(message)
filename = self._handle_download(client, img_host + img_path)
filename = self._handle_sub_request(client, img_host + img_path)
if not same_host:
client.close()
return filename
class ChunkedResponseHandler(ResponseHandler):
def __init__(self, client: HTTPClient, headers, status_code, url):
super().__init__(client, headers, status_code, url)
def handle(self, buffer: bytes):
return None

120
client/Retriever.py Normal file
View File

@@ -0,0 +1,120 @@
import logging
from typing import Dict
from client.httpclient import HTTPClient, BUFSIZE, IncompleteResponse, InvalidResponse, UnsupportedEncoding
class Retriever:
    """Base class for the strategies that read a response payload."""

    client: HTTPClient
    headers: Dict[str, str]

    def __init__(self, client: HTTPClient):
        self.client = client

    def retrieve(self):
        """Placeholder; this implementation does nothing and returns ``None``."""
        return None

    @staticmethod
    def create(client: HTTPClient, headers: Dict[str, str]):
        """Pick the retriever matching the framing announced in *headers*.

        :raises UnsupportedEncoding: for a transfer-encoding other than
            ``chunked`` or for any content-encoding.
        """
        transfer_encoding = headers.get("transfer-encoding")
        content_encoding = headers.get("content-encoding")

        # only chunked transfer-encoding is supported
        if transfer_encoding and transfer_encoding != "chunked":
            raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
        # content-encoding is not supported
        if content_encoding:
            raise UnsupportedEncoding("content-encoding", content_encoding)

        if transfer_encoding:
            return ChunkedRetriever(client)

        content_length = headers.get("content-length")
        if content_length:
            return ContentLengthRetriever(client, int(content_length))

        logging.warning("Transfer-encoding and content-length not specified, trying without")
        return RawRetriever(client)
class ContentLengthRetriever(Retriever):
    """Retrieves a payload whose size is announced by content-length."""

    length: int

    def __init__(self, client: HTTPClient, length: int):
        super().__init__(client)
        self.length = length

    def retrieve(self):
        """Yield the payload in pieces of at most ``BUFSIZE`` bytes.

        Stops early, with a warning, when the client returns no more data
        before ``length`` bytes were received.

        :raises IncompleteResponse: if reading times out or the connection
            fails; the client is closed before raising.
        """
        received = 0

        while received < self.length:
            # Read at most one buffer at a time so a large payload is never
            # held in memory as a whole.
            read_size = min(BUFSIZE, self.length - received)

            try:
                buffer = self.client.read(read_size)
            except TimeoutError as err:
                logging.error("Timed out before receiving complete payload")
                self.client.close()
                raise IncompleteResponse("Timed out before receiving complete payload") from err
            except ConnectionError as err:
                logging.error("Connection closed before receiving complete payload")
                self.client.close()
                raise IncompleteResponse("Connection closed before receiving complete payload") from err

            logging.debug("Received payload length: %s", len(buffer))

            if len(buffer) == 0:
                logging.warning("Received payload length %s less than expected %s", received, self.length)
                break

            received += len(buffer)
            logging.debug("Processed payload: %r", received)
            yield buffer

        # Kept so the generator's return value stays what it was.
        return b""
class RawRetriever(Retriever):
    """Retrieves a payload of unknown size by reading until the stream ends."""

    def retrieve(self):
        """Yield data until the client returns nothing, times out or fails."""
        while True:
            try:
                buffer = self.client.read()
            # A tuple is required here: `except A or B` only catches A.
            except (TimeoutError, ConnectionError):
                return b""
            if not buffer:
                # An empty read means end of stream; without this check the
                # loop would yield empty buffers forever.
                return b""
            yield buffer
class ChunkedRetriever(Retriever):
    """Retrieves a payload sent with chunked transfer-encoding."""

    def retrieve(self):
        """Yield the data of every chunk up to the terminating zero-size chunk.

        :raises InvalidResponse: if a chunk-size line is malformed.
        """
        while True:
            chunk_size = self._get_chunk_size()
            logging.debug("chunk-size: %s", chunk_size)
            if chunk_size == 0:
                self.client.reset_request()
                break

            buffer = self.client.read(chunk_size)
            logging.debug("chunk: %r", buffer)
            yield buffer

            self.client.read_line()  # remove CRLF
        # Kept so the generator's return value stays what it was.
        return b""

    def _get_chunk_size(self):
        """Read a chunk-size line and return the size as an int.

        Anything after a ``;`` (chunk extensions) is ignored.

        :raises InvalidResponse: if the size is not a plain hexadecimal
            number.
        """
        line = self.client.read_line()
        size_field = line.split(";", 1)[0].strip()

        # int(x, 16) alone would also accept "-5", "0x10" or "1_0"; a
        # negative size would make read() consume the rest of the stream.
        hex_digits = "0123456789abcdefABCDEF"
        if not size_field or any(char not in hex_digits for char in size_field):
            raise InvalidResponse()
        return int(size_field, 16)

View File

@@ -1,21 +1,35 @@
import logging
import re
import socket
from typing import Dict
from io import BufferedReader
from typing import TextIO, IO
BUFSIZE = 4096
TIMEOUT = 3
FORMAT = "UTF-8"
MAXLINE = 4096
class HTTPClient(socket.socket):
host: str
file: BufferedReader
def __init__(self, host: str):
    """Create a TCP socket for *host* with a buffered binary reader on it.

    :param host: host name stored on the client as ``self.host``.
    """
    super().__init__(socket.AF_INET, socket.SOCK_STREAM)
    self.host = host
    # One timeout setting, taken from the module constant instead of a
    # second hard-coded value that silently overrides it.
    self.settimeout(TIMEOUT)
    self.file = self.makefile("rb")
def close(self):
    """Close the buffered reader, then the socket itself."""
    reader = self.file
    reader.close()
    super().close()
def reset_request(self):
    """Close the current buffered reader and open a new one on the socket."""
    stale_reader = self.file
    stale_reader.close()
    self.file = self.makefile("rb")
def _do_receive(self):
if self.fileno() == -1:
@@ -41,6 +55,26 @@ class HTTPClient(socket.socket):
logging.debug("Timed out after waiting %s seconds for response", TIMEOUT * count)
raise TimeoutError("Request timed out")
def read(self, size=BUFSIZE, blocking=True) -> bytes:
    """Read up to *size* bytes from the buffered reader.

    :param size: maximum number of bytes to read.
    :param blocking: when true use the reader's ``read``, otherwise its
        ``read1``.
    """
    reader = self.file.read if blocking else self.file.read1
    return reader(size)
def read_line(self):
    """Read one line and return it decoded with ``FORMAT``."""
    raw = self.read_bytes_line()
    return raw.decode(FORMAT)
def read_bytes_line(self):
    """
    Read one line of at most ``MAXLINE`` bytes from the buffered reader.

    :rtype: bytes
    :raises InvalidResponse: if the line is longer than ``MAXLINE`` bytes.
    """
    # Ask for one byte more than allowed so an over-long line is detectable.
    raw = self.file.readline(MAXLINE + 1)
    if len(raw) <= MAXLINE:
        return raw
    raise InvalidResponse("Line too long")
def validate_status_line(self, status_line: str):
split = list(filter(None, status_line.split(" ")))
if len(split) < 3:
@@ -129,3 +163,7 @@ class UnsupportedEncoding(HTTPException):
def __init__(self, enc_type, encoding):
    """Record which kind of encoding was unsupported.

    :param enc_type: name of the header that announced the encoding.
    :param encoding: the unsupported encoding value.
    """
    # Was spelled `__init`, so it never ran as the constructor; forward the
    # same arguments the inherited constructor received until now.
    super().__init__(enc_type, encoding)
    self.enc_type = enc_type
    self.encoding = encoding
class IncompleteResponse(HTTPException):
    """Raised when a response payload could not be received completely."""

    def __init__(self, cause):
        """
        :param cause: description of why the payload is incomplete.
        """
        # Was spelled `__init`, so it never ran as the constructor; forward
        # the same argument the inherited constructor received until now.
        super().__init__(cause)
        self.cause = cause

View File

@@ -3,6 +3,7 @@
import socket
# socket has a listen and an accept method
import time
SERVER = "127.0.0.1" #dynamisch fixen in project
PORT = 5055
@@ -26,8 +27,11 @@ def start():
while connected: # while client is connected, we want to recieve messages
msg = conn.recv(HEADER).decode(FORMAT).rstrip() # Argument is maximum size of msg (in project look into details of accp), decode is for converting bytes to strings, rstrip is for stripping messages for special hidden characters
print("message: ", msg)
if msg == DISCONNECT_MESSAGE:
connected = False
for i in range(0,10):
conn.send(b"test")
time.sleep(1)
break
print("close connection ", addr[0], " disconnected.")
conn.close()