Client update

2021-03-19 03:29:35 +01:00
parent 1966a174bb
commit 797cdb0c0e
4 changed files with 425 additions and 37 deletions


@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
 import argparse
 import logging
-import sys
-import socket
 import re
+import socket
+import sys
 import time
-import os
-from client.ResponseHandler import ResponseHandler
+from urllib.parse import urlparse
+from client import ResponseHandler
+from client.httpclient import HTTPClient
 
 
 FORMAT = 'utf-8'
 BUFSIZE = 4096
@@ -125,22 +126,7 @@ def get_chunk(buffer: bytes):
     return buffer[:split_start], buffer[split_end:]
 
 
-def get_html_filename(headers):
-    if "CONTENT-LOCATION" not in headers:
-        return "index.html"
-    filename = headers["CONTENT-LOCATION"]
-    result = os.path.basename(filename).strip()
-    if len(result.strip()) == 0:
-        return 'index.html'
-    return result
-
-
 def response_parser(client: socket.socket):
-    client.settimeout(3.0)
     try:
         buffer = client.recv(BUFSIZE)
     except TimeoutError as err:
@@ -165,7 +151,7 @@ def response_parser(client: socket.socket):
     if payload_size == 0:
         return
 
-    filename = get_html_filename(headers)
+    filename = util.get_html_filename(headers)
     f = open(filename, "wb")
     f.write(buffer)
@@ -199,6 +185,20 @@ def http_parser(client: socket.socket):
         logging.debug("chunk: %r", chunk)
 
 
+def parse_uri(uri: str):
+    parsed = urlparse(uri)
+    # If there is no netloc, the url is invalid, so prepend `//` and try again
+    if parsed.netloc == "":
+        parsed = urlparse("//" + uri)
+
+    host = parsed.netloc
+    path = parsed.path
+    if len(path) == 0 or path[0] != '/':
+        path = "/" + path
+
+    return host, path
+
+
 def main():
     parser = argparse.ArgumentParser(description='HTTP Client')
     parser.add_argument("--verbose", "-v", action='count', default=0, help="Increase verbosity level of logging")
@@ -211,13 +211,19 @@ def main():
     logging.basicConfig(level=logging.ERROR - (10 * arguments.verbose))
     logging.debug("Arguments: %s", arguments)
 
-    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    client.connect((arguments.URI, arguments.port))
+    (host, path) = parse_uri(arguments.URI)
+    client = HTTPClient(host)
+    client.connect((host, arguments.port))
 
-    message = "GET /Protocols/HTTP/Performance/microscape/ HTTP/1.1\r\nHost: www.w3.org:80\r\n\r\n".encode(FORMAT)
+    message = "GET {path} HTTP/1.1\r\n".format(path=path)
+    message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
+    message += "Host: {host}\r\n\r\n".format(host=host)
+    message = message.encode(FORMAT)
+    logging.debug("Sending HTTP message: %r", message)
     client.sendall(message)
 
-    response_parser(client)
+    ResponseHandler.handle(client, arguments.URI)
+    # response_parser(client)
     # http_parser(client)
     # tmp = b''
     # keep = False

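For reference (not part of the commit): what the new parse_uri() helper above returns for a few representative URIs. The function body is copied from the hunk; the example hosts and paths are placeholders.

from urllib.parse import urlparse

def parse_uri(uri: str):
    parsed = urlparse(uri)
    # Without a scheme or a leading //, urlparse puts everything into `path`,
    # so prepend // and parse again to recover the host.
    if parsed.netloc == "":
        parsed = urlparse("//" + uri)

    host = parsed.netloc
    path = parsed.path
    if len(path) == 0 or path[0] != '/':
        path = "/" + path

    return host, path

print(parse_uri("www.example.org"))                   # ('www.example.org', '/')
print(parse_uri("www.example.org/a/page.html"))       # ('www.example.org', '/a/page.html')
print(parse_uri("http://www.example.org/page.html"))  # ('www.example.org', '/page.html')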

@@ -1,26 +1,110 @@
+import logging
 import os
-from socket import socket
 from typing import Dict
 from urllib.parse import urlparse
 
+from bs4 import BeautifulSoup
+
+from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
+
+
+def handle(client: HTTPClient, url: str):
+    logging.debug("Waiting for response")
+    try:
+        buffer = client.receive()
+    except TimeoutError:
+        print("[ABRT] Response timed out")
+        return
+
+    try:
+        (header_chunk, buffer) = client.get_crlf_chunk(buffer)
+        (status_line, headers) = client.parse_headers(header_chunk)
+        client.validate_status_line(status_line)
+        status_code = int(status_line.split(" ")[1])
+        response_handler = construct(client, headers, status_code, url)
+        response_handler.handle(buffer)
+    except InvalidResponse as e:
+        logging.debug("Internal error: Response could not be parsed", exc_info=e)
+        print("[ABRT] Invalid response")
+        return
+    except InvalidStatusLine as e:
+        logging.debug("Internal error: Invalid status-line in response", exc_info=e)
+        print("[ABRT] Invalid response")
+        return
+    except UnsupportedEncoding as e:
+        logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
+        print("[ABRT] Invalid response")
+        return
+
+
+def construct(client: HTTPClient, headers, status_code, url):
+    # only chunked transfer-encoding is supported
+    transfer_encoding = headers.get("transfer-encoding")
+    if transfer_encoding and transfer_encoding != "chunked":
+        raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
+    chunked = transfer_encoding
+
+    # content-encoding is not supported
+    content_encoding = headers.get("content-encoding")
+    if content_encoding:
+        raise UnsupportedEncoding("content-encoding", content_encoding)
+
+    if chunked:
+        return ChunkedResponseHandler(client, headers, status_code, url)
+    else:
+        content_type = headers.get("content-type")
+        if content_type and "text/html" in content_type:
+            return HTMLResponseHandler(client, headers, status_code, url)
+        return PlainResponseHandler(client, headers, status_code, url)
+
+
+def parse_uri(uri: str):
+    parsed = urlparse(uri)
+    # If there is no netloc, the url is invalid, so prepend `//` and try again
+    if parsed.netloc == "":
+        parsed = urlparse("//" + uri)
+
+    host = parsed.netloc
+    path = parsed.path
+    if len(path) == 0 or path[0] != '/':
+        path = "/" + path
+
+    return host, path
+
+
 class ResponseHandler:
-    client: socket
-    url: str
+    client: HTTPClient
     headers: Dict[str, str]
+    status_code: int
+    url: str
 
-    def __init__(self, url: str, client: socket):
-        self.headers = {}
-        self.url = url
+    def __init__(self, client: HTTPClient, headers: Dict[str, str], status_code: int, url: str):
         self.client = client
+        self.headers = headers
+        self.status_code = status_code
+        self.url = url
         pass
 
-    def get_html_filename(self):
+    def handle(self, buffer: bytes):
+        pass
+
+    def get_filename(self):
+        """Returns the filename to download the payload to.
+        """
         filename = "index.html"
         parsed = urlparse(self.url)
+        # If there is no netloc, the url is invalid, so prepend `//` and try again
         if parsed.netloc == "":
             parsed = urlparse("//" + self.url)
+
+        # If the path contains a `/` get only the last part and use it as filename
+        # If the path ends with a `/`, it's a directory so ignore it.
         if len(parsed.path) != 0:
             index = parsed.path.rfind("/")
             if index == -1:
@@ -29,14 +113,179 @@ class ResponseHandler:
                 filename = parsed.path[index:]
 
         result = os.path.basename(filename).strip()
-        return result
+        if any(letter.isalnum() for letter in result):
+            return result
+        return "index.html"
+
+    def _handle_download(self, client, url):
+        logging.debug("Waiting for response")
+        try:
+            buffer = client.receive()
+        except TimeoutError:
+            print("[ABRT] Response timed out")
+            return
+
+        try:
+            (header_chunk, buffer) = client.get_crlf_chunk(buffer)
+            (status_line, headers) = client.parse_headers(header_chunk)
+            client.validate_status_line(status_line)
+            status_code = int(status_line.split(" ")[1])
+            if status_code != 200:
+                raise InvalidResponse("Code not 200")
+            response_handler = construct(client, headers, status_code, url)
+            filename = response_handler.handle(buffer)
+            return filename
+        except InvalidResponse as e:
+            logging.debug("Internal error: Response could not be parsed", exc_info=e)
+            print("[ABRT] Invalid response")
+            return
+        except InvalidStatusLine as e:
+            logging.debug("Internal error: Invalid status-line in response", exc_info=e)
+            print("[ABRT] Invalid response")
+            return
+        except UnsupportedEncoding as e:
+            logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
+            print("[ABRT] Invalid response")
+            return
 
 
 class PlainResponseHandler(ResponseHandler):
-    def __init__(self, url: str, client: socket):
-        super().__init__(url, client)
+    def __init__(self, client: HTTPClient, headers, status_code, url):
+        super().__init__(client, headers, status_code, url)
+
+    def _get_payload_size(self):
+        content_length = self.__get_content_length()
+        if content_length == 0:
+            logging.debug("content-length is 0")
+            return None
+
+        payload_size = content_length
+        if not content_length:
+            payload_size = -1
+            logging.debug("No content-length specified")
+        else:
+            logging.debug("Expected content-length=%s", payload_size)
+        return payload_size
+
+    def handle(self, buffer: bytes):
+        payload_size = self._get_payload_size()
+        if payload_size is None:
+            return
+
+        logging.debug("Retrieving payload")
+        filename = self.get_filename()
+        file = open(filename, "wb")
+        self._retrieve(file, buffer, payload_size)
+        file.close()
+        return filename
+
+    def _retrieve(self, file, buffer: bytes, payload_size: int):
+        file.write(buffer)
+        cur_payload_size = len(buffer)
+        while cur_payload_size < payload_size:
+            buffer = self.client.receive()
+            logging.debug("Received payload length: %s", len(buffer))
+            if len(buffer) == 0:
+                logging.warning("Received payload length %s less than expected %s", cur_payload_size, payload_size)
+                break
+            cur_payload_size += len(buffer)
+            logging.debug("Processed payload: %r", cur_payload_size)
+            file.write(buffer)
+
+    def __get_content_length(self):
+        content_length = self.headers.get("content-length")
+        if not content_length:
+            return None
+        return int(content_length)
+
+
+class HTMLResponseHandler(PlainResponseHandler):
+    def __init__(self, client: HTTPClient, headers, status_code, url):
+        super().__init__(client, headers, status_code, url)
+
+    def handle(self, buffer: bytes):
+        payload_size = self._get_payload_size()
+        if payload_size is None:
+            return
+
+        logging.debug("Retrieving payload")
+        filename = self.get_filename()
+        tmp_filename = "." + filename + ".tmp"
+        file = open(tmp_filename, "wb")
+        self._retrieve(file, buffer, payload_size)
+        file.close()
+
+        self.__download_images(tmp_filename, filename)
+        os.remove(tmp_filename)
+        return filename
+
+    def __download_images(self, tmp_filename, target_filename):
+        (host, path) = parse_uri(self.url)
+        with open(tmp_filename, "r") as fp:
+            soup = BeautifulSoup(fp, "lxml")
+            for tag in soup.find_all("img"):
+                try:
+                    tag["src"] = self.__download_image(tag["src"], host, path)
+                except Exception as e:
+                    logging.error("Failed to download image, skipping...", exc_info=e)
+        with open(target_filename, 'w') as file:
+            file.write(str(soup))
+
+    def __download_image(self, img_src, host, path):
+        parsed = urlparse(img_src)
+        same_host = True
+        if len(parsed.netloc) == 0 or parsed.netloc == host:
+            img_host = host
+            if parsed.path[0] != "/":
+                base = os.path.split(path)[0]
+                if base[-1] != '/':
+                    base += "/"
+                img_path = base + parsed.path
+            else:
+                img_path = parsed.path
+        else:
+            same_host = False
+            (img_host, img_path) = parse_uri(img_src)
+
+        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
+        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
+        message += "Host: {host}\r\n\r\n".format(host=img_host)
+        message = message.encode(FORMAT)
+
+        if same_host:
+            client = self.client
+        else:
+            client = HTTPClient(img_host)
+            client.connect((img_host, 80))
+        client.sendall(message)
+        filename = self._handle_download(client, img_host + img_path)
+        if not same_host:
+            client.close()
+        return filename
 
 
 class ChunkedResponseHandler(ResponseHandler):
-    def __init__(self, url: str, client: socket):
-        super().__init__(url, client)
+    def __init__(self, client: HTTPClient, headers, status_code, url):
+        super().__init__(client, headers, status_code, url)
+
+    def handle(self, buffer: bytes):
+        return None

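Not part of the commit, but a minimal sketch of the dispatch construct() performs, assuming the package layout implied by the imports in this diff (client.httpclient plus the ResponseHandler module in the hunk above); hosts and URLs are placeholders. Header keys are lower-case because HTTPClient.parse_headers() lower-cases them.

from client import ResponseHandler
from client.httpclient import HTTPClient

client = HTTPClient("www.example.org")  # placeholder host, never connected here

html = ResponseHandler.construct(client, {"content-type": "text/html; charset=utf-8"}, 200, "www.example.org/")
plain = ResponseHandler.construct(client, {"content-length": "42"}, 200, "www.example.org/data.bin")
chunked = ResponseHandler.construct(client, {"transfer-encoding": "chunked"}, 200, "www.example.org/")

print(type(html).__name__, type(plain).__name__, type(chunked).__name__)
# HTMLResponseHandler PlainResponseHandler ChunkedResponseHandler
client.close()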
client/httpclient.py (new file, 131 additions)

@@ -0,0 +1,131 @@
+import logging
+import re
+import socket
+from typing import Dict
+
+BUFSIZE = 4096
+TIMEOUT = 3
+FORMAT = "UTF-8"
+
+
+class HTTPClient(socket.socket):
+    host: str
+
+    def __init__(self, host: str):
+        super().__init__(socket.AF_INET, socket.SOCK_STREAM)
+        self.settimeout(TIMEOUT)
+        self.host = host
+
+    def _do_receive(self):
+        if self.fileno() == -1:
+            raise Exception("Connection closed")
+        result = self.recv(BUFSIZE)
+        return result
+
+    def receive(self):
+        """Receive data from the client up to BUFSIZE
+        """
+        count = 0
+        while True:
+            count += 1
+            try:
+                return self._do_receive()
+            except socket.timeout:
+                logging.debug("Socket receive timed out after %s seconds", TIMEOUT)
+                if count == 3:
+                    break
+                logging.debug("Retrying %s", count)
+        logging.debug("Timed out after waiting %s seconds for response", TIMEOUT * count)
+        raise TimeoutError("Request timed out")
+
+    def validate_status_line(self, status_line: str):
+        split = list(filter(None, status_line.split(" ")))
+        if len(split) < 3:
+            return False
+
+        # Check HTTP version
+        http_version = split.pop(0)
+        if len(http_version) < 8 or http_version[4] != "/":
+            raise InvalidStatusLine(status_line)
+        (name, version) = http_version[:4], http_version[5:]
+        if name != "HTTP" or not re.match(r"1\.[01]", version):
+            return False
+
+        if not re.match(r"\d{3}", split[0]):
+            return False
+        return True
+
+    def get_crlf_chunk(self, buffer: bytes):
+        """Finds the line break type (`CRLF` or `LF`) and splits the specified buffer
+        when encountering 2 consecutive linebreaks.
+        Returns a tuple with the first part and the remaining of the buffer.
+        :param buffer:
+        :return:
+        """
+        lf_pos = buffer.find(b"\n\n")
+        crlf_pos = buffer.find(b"\r\n\r\n")
+        if lf_pos != -1 and (crlf_pos == -1 or lf_pos < crlf_pos):
+            split_start = lf_pos
+            split_end = lf_pos + 2
+        else:
+            split_start = crlf_pos
+            split_end = crlf_pos + 4
+        return buffer[:split_start], buffer[split_end:]
+
+    def parse_headers(self, data: bytes):
+        headers = {}
+        # decode bytes, split into lines and filter
+        header_split = list(
+            filter(lambda l: l != "" and not l[0].isspace(), map(str.strip, data.decode("utf-8").split("\n"))))
+        if len(header_split) == 0:
+            raise InvalidResponse(data)
+
+        start_line = header_split.pop(0)
+        logging.debug("start-line: %r", start_line)
+        for line in header_split:
+            pos = line.find(":")
+            if pos <= 0 or pos >= len(line) - 1:
+                continue
+            (header, value) = map(str.strip, line.split(":", 1))
+            headers[header.lower()] = value.lower()
+        logging.debug("Parsed headers: %r", headers)
+        return start_line, headers
+
+
+class HTTPException(Exception):
+    """ Base class for HTTP exceptions """
+
+
+class InvalidResponse(HTTPException):
+    """ Response message cannot be parsed """
+
+    def __init__(self, message):
+        self.message = message
+
+
+class InvalidStatusLine(HTTPException):
+    """ Response status line is invalid """
+
+    def __init__(self, line):
+        self.line = line
+
+
+class UnsupportedEncoding(HTTPException):
+    """ Response encoding is not supported """
+
+    def __init__(self, enc_type, encoding):
+        self.enc_type = enc_type
+        self.encoding = encoding

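A quick usage sketch of the parsing helpers in the new HTTPClient class (not part of the commit); the response bytes are a made-up example and the host is a placeholder, so no network I/O happens.

from client.httpclient import HTTPClient

raw = b"HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nContent-Length: 5\r\n\r\nhello"

client = HTTPClient("www.example.org")             # placeholder host, never connected
head, body = client.get_crlf_chunk(raw)            # split the header block from the payload
status_line, headers = client.parse_headers(head)

print(status_line)                                 # HTTP/1.1 200 OK
print(client.validate_status_line(status_line))    # True
print(headers["content-type"], headers["content-length"])  # text/html 5
print(body)                                        # b'hello'
client.close()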

@@ -0,0 +1,2 @@
+beautifulsoup4~=4.9.3
+lxml==4.6.2
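beautifulsoup4 and lxml back the new HTMLResponseHandler image pass. As an illustration only (not part of the commit), the same BeautifulSoup/lxml rewrite on an inline HTML string, with a trivial src rewrite standing in for the handler's download-and-rename step:

from bs4 import BeautifulSoup

html = '<html><body><img src="/img/logo.png"><img src="http://cdn.example.org/pic.png"></body></html>'
soup = BeautifulSoup(html, "lxml")
for tag in soup.find_all("img"):
    # The real handler downloads each image first, then points src at the local filename.
    tag["src"] = tag["src"].rsplit("/", 1)[-1]
print(soup)
# <html><body><img src="logo.png"/><img src="pic.png"/></body></html>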