This commit is contained in:
2021-03-21 23:01:09 +01:00
parent 638576f471
commit d25d2ef993
14 changed files with 681 additions and 226 deletions

View File

@@ -1,121 +0,0 @@
import logging
from abc import ABC, abstractmethod
from typing import Dict
from client.httpclient import HTTPClient, BUFSIZE, IncompleteResponse, InvalidResponse, UnsupportedEncoding
class Retriever(ABC):
    """Strategy for reading a response body from an HTTPClient.

    Concrete subclasses implement `retrieve`; `create` selects the subclass
    that matches the response headers.
    """

    client: HTTPClient

    def __init__(self, client: HTTPClient):
        self.client = client

    @abstractmethod
    def retrieve(self):
        """Yield the response body piece by piece."""
        pass

    @staticmethod
    def create(client: HTTPClient, headers: Dict[str, str]):
        """Return the retriever that matches the given response headers.

        :param client: the client the body will be read from
        :param headers: parsed response headers, looked up by lower-case name
        :raises UnsupportedEncoding: if a transfer-encoding other than
            "chunked" or any content-encoding is specified
        :raises InvalidResponse: if content-length is not a non-negative
            integer
        """
        # only chunked transfer-encoding is supported
        transfer_encoding = headers.get("transfer-encoding")
        if transfer_encoding and transfer_encoding != "chunked":
            raise UnsupportedEncoding("transfer-encoding", transfer_encoding)

        # content-encoding is not supported
        content_encoding = headers.get("content-encoding")
        if content_encoding:
            raise UnsupportedEncoding("content-encoding", content_encoding)

        if transfer_encoding:
            return ChunkedRetriever(client)

        content_length = headers.get("content-length")
        if not content_length:
            logging.warning("Transfer-encoding and content-length not specified, trying without")
            return RawRetriever(client)

        # A malformed value must surface as an HTTP error, not a ValueError.
        try:
            length = int(content_length)
        except ValueError:
            length = -1
        if length < 0:
            logging.error("Invalid content-length value: %r", content_length)
            raise InvalidResponse("Invalid content-length")

        return ContentLengthRetriever(client, length)
class ContentLengthRetriever(Retriever):
    """Retriever for a body whose size is announced by content-length."""

    length: int

    def __init__(self, client: HTTPClient, length: int):
        super().__init__(client)
        self.length = length

    def retrieve(self):
        """Yield the body in buffers of at most BUFSIZE bytes.

        Stops once `length` bytes were received, or early (with a warning)
        when the client returns an empty buffer.

        :raises IncompleteResponse: if the read times out or the connection
            fails before the full payload arrived; the client is closed first
        """
        cur_payload_size = 0

        while cur_payload_size < self.length:
            # Never ask for more than one buffer, nor more than is left.
            read_size = min(BUFSIZE, self.length - cur_payload_size)

            try:
                buffer = self.client.read(read_size)
            except TimeoutError as err:
                logging.error("Timed out before receiving complete payload")
                self.client.close()
                raise IncompleteResponse("Timed out before receiving complete payload") from err
            except ConnectionError as err:
                logging.error("Connection closed before receiving complete payload")
                self.client.close()
                raise IncompleteResponse("Connection closed before receiving complete payload") from err

            logging.debug("Received payload length: %s", len(buffer))

            if len(buffer) == 0:
                logging.warning("Received payload length %s less than expected %s", cur_payload_size, self.length)
                break

            cur_payload_size += len(buffer)
            logging.debug("Processed payload: %r", cur_payload_size)

            yield buffer

        return b""
class RawRetriever(Retriever):
    """Retriever used when neither chunking nor a content-length is known."""

    def retrieve(self):
        """Yield whatever the client delivers until the stream ends.

        The generator finishes when a read times out, the connection fails,
        or the client returns an empty buffer.
        """
        while True:
            try:
                buffer = self.client.read()
            # A tuple is required here: `TimeoutError or ConnectionError`
            # evaluates to TimeoutError alone.
            except (TimeoutError, ConnectionError):
                return b""

            # An empty read means nothing more will arrive; without this
            # check the loop would yield empty buffers forever.
            if not buffer:
                return b""

            yield buffer
class ChunkedRetriever(Retriever):
    """Retriever for bodies sent with chunked transfer-encoding."""

    def retrieve(self):
        """Yield the payload of each chunk until the zero-sized chunk.

        After the terminating chunk the client's request state is reset.

        :raises InvalidResponse: if a chunk-size line is not hexadecimal
        """
        size = self.__get_chunk_size()
        logging.debug("chunk-size: %s", size)

        while size != 0:
            data = self.client.read(size)
            logging.debug("chunk: %r", data)
            yield data

            self.client.read_line()  # remove CRLF

            size = self.__get_chunk_size()
            logging.debug("chunk-size: %s", size)

        self.client.reset_request()
        return b""

    def __get_chunk_size(self):
        """Read one chunk-size line and return its value as an int."""
        # Anything after ";" is a chunk extension and is ignored.
        size_field = self.client.read_line().partition(";")[0]

        try:
            return int(size_field, 16)
        except ValueError:
            raise InvalidResponse()

View File

@@ -2,9 +2,10 @@ import logging
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from client.ResponseHandler import ResponseHandler
from client.httpclient import FORMAT, HTTPClient, InvalidResponse, InvalidStatusLine, UnsupportedEncoding
from client.response_handler import ResponseHandler
from client.httpclient import FORMAT, HTTPClient
from httplib import parser
from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding
class AbstractCommand(ABC):
@@ -34,7 +35,7 @@ class AbstractCommand(ABC):
(host, path) = self.parse_uri()
client = HTTPClient(host)
client.connect((host, int(self.port)))
client.conn.connect((host, int(self.port)))
message = f"{self.command} {path} HTTP/1.1\r\n"
message += f"Host: {host}\r\n"
@@ -44,7 +45,7 @@ class AbstractCommand(ABC):
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
logging.debug("Sending HTTP message: %r", encoded_msg)
client.sendall(encoded_msg)
client.conn.sendall(encoded_msg)
logging.info("HTTP request sent, awaiting response...")
@@ -118,9 +119,9 @@ class GetCommand(AbstractCommand):
return "GET"
def _await_response(self, client):
(version, status, msg) = ResponseHandler.get_status_line(client)
(version, status, msg) = parser.get_status_line(client)
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
headers = ResponseHandler.get_headers(client)
headers = parser.get_headers(client)
logging.debug("Parsed headers: %r", headers)
handler = ResponseHandler.create(client, headers, status, self.url)

View File

@@ -1,6 +1,6 @@
import logging
import socket
from io import BufferedReader
from httplib.httpsocket import HTTPSocket
BUFSIZE = 4096
TIMEOUT = 3
@@ -8,98 +8,8 @@ FORMAT = "UTF-8"
MAXLINE = 4096
class HTTPClient(socket.socket):
class HTTPClient(HTTPSocket):
host: str
file: BufferedReader
def __init__(self, host: str):
super().__init__(socket.AF_INET, socket.SOCK_STREAM)
self.settimeout(TIMEOUT)
self.host = host
self.setblocking(True)
self.settimeout(3.0)
self.file = self.makefile("rb")
def close(self):
    """Close the buffered reader and then the socket itself."""
    try:
        self.file.close()
    finally:
        # Release the socket even if closing the reader raised.
        super().close()
def reset_request(self):
    """Close the current buffered reader and open a fresh one on this socket."""
    previous_reader = self.file
    previous_reader.close()
    self.file = self.makefile("rb")
def __do_receive(self):
    """Receive up to BUFSIZE bytes from the socket.

    :raises ConnectionError: if the socket has already been closed
    """
    # fileno() is -1 once the socket is closed.
    if self.fileno() == -1:
        # ConnectionError is still an Exception, so existing handlers keep
        # working, while callers can now catch it specifically.
        raise ConnectionError("Connection closed")

    return self.recv(BUFSIZE)
def receive(self):
    """Receive data from the client up to BUFSIZE

    A timed-out receive is retried; after the third timeout the call gives up.

    :raises TimeoutError: if all three attempts timed out
    """
    attempt = 0
    for attempt in range(1, 4):
        try:
            return self.__do_receive()
        except socket.timeout:
            logging.debug("Socket receive timed out after %s seconds", TIMEOUT)

        # Only announce a retry when another attempt will actually follow.
        if attempt < 3:
            logging.debug("Retrying %s", attempt)

    logging.debug("Timed out after waiting %s seconds for response", TIMEOUT * attempt)
    raise TimeoutError("Request timed out")
def read(self, size=BUFSIZE, blocking=True) -> bytes:
    """Read up to `size` bytes from the buffered reader.

    :param size: maximum number of bytes to read
    :param blocking: when true the reader's ``read`` is used, otherwise
        its ``read1``
    """
    reader = self.file.read if blocking else self.file.read1
    return reader(size)
def read_line(self):
    """Read one line and return it decoded with FORMAT."""
    raw_line = self.read_bytes_line()
    return raw_line.decode(FORMAT)
def read_bytes_line(self):
    """
    Read a single line from the buffered reader.

    :rtype: bytes
    :raises InvalidResponse: if the line is longer than MAXLINE bytes
    """
    # Ask for one byte more than allowed so an over-long line is detectable.
    raw_line = self.file.readline(MAXLINE + 1)
    if len(raw_line) <= MAXLINE:
        return raw_line
    raise InvalidResponse("Line too long")
class HTTPException(Exception):
    """ Base class for HTTP exceptions """


class InvalidResponse(HTTPException):
    """ Response message cannot be parsed """

    def __init__(self, message=None):
        # Forward only a supplied message so that str() of a bare
        # InvalidResponse() stays empty.
        super().__init__(*(() if message is None else (message,)))
        self.message = message


class InvalidStatusLine(HTTPException):
    """ Response status line is invalid """

    def __init__(self, line=None):
        super().__init__(*(() if line is None else (line,)))
        self.line = line


class UnsupportedEncoding(HTTPException):
    """ Response encoding is not supported """

    def __init__(self, enc_type=None, encoding=None):
        super().__init__(*(arg for arg in (enc_type, encoding) if arg is not None))
        self.enc_type = enc_type
        self.encoding = encoding


class IncompleteResponse(HTTPException):
    """ Response ended before the complete payload was received """

    def __init__(self, cause=None):
        super().__init__(*(() if cause is None else (cause,)))
        self.cause = cause
super().__init__(socket.socket(socket.AF_INET, socket.SOCK_STREAM), host)

View File

@@ -1,14 +1,15 @@
import logging
import os
import re
from abc import ABC, abstractmethod
from typing import Dict
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from client.Retriever import Retriever
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
from client.httpclient import HTTPClient, FORMAT
from httplib.retriever import Retriever
from httplib import parser
from httplib.exceptions import InvalidResponse
class ResponseHandler(ABC):
@@ -31,17 +32,6 @@ class ResponseHandler(ABC):
@staticmethod
def create(client: HTTPClient, headers, status_code, url):
# only chunked transfer-encoding is supported
transfer_encoding = headers.get("transfer-encoding")
if transfer_encoding and transfer_encoding != "chunked":
raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
chunked = transfer_encoding
# content-encoding is not supported
content_encoding = headers.get("content-encoding")
if content_encoding:
raise UnsupportedEncoding("content-encoding", content_encoding)
retriever = Retriever.create(client, headers)
content_type = headers.get("content-type")
@@ -49,78 +39,6 @@ class ResponseHandler(ABC):
return HTMLDownloadHandler(retriever, client, headers, url)
return RawDownloadHandler(retriever, client, headers, url)
@staticmethod
def get_status_line(client: HTTPClient):
    """Read and validate the status line of a response.

    :param client: the client to read the line from
    :return: tuple ``(version, status, reason)`` where version is "1.0" or
        "1.1", status is an int from 100 to 999 and reason is the complete
        reason phrase (empty when the server sent none)
    :raises InvalidStatusLine: if the line is not a valid HTTP/1.x
        status line
    """
    line = client.read_line()

    # Split off version and status only, so that a multi-word reason phrase
    # such as "Not Found" stays intact; strip() removes the line terminator.
    parts = line.strip().split(None, 2)
    if len(parts) < 2:
        raise InvalidStatusLine(line)
    http_version, status = parts[0], parts[1]
    reason = parts[2] if len(parts) > 2 else ""

    # Check HTTP version
    name, slash, version = http_version.partition("/")
    if name != "HTTP" or not slash or not re.fullmatch(r"1\.[01]", version):
        raise InvalidStatusLine(line)

    # Exactly three ASCII digits, first one non-zero.
    if not re.fullmatch(r"[1-9][0-9]{2}", status):
        raise InvalidStatusLine(line)

    return version, int(status), reason
@staticmethod
def get_headers(client: HTTPClient):
    """Read the header section of a response into a dict.

    Lines are read until a blank line (or an empty read) is reached.
    Continuation lines that start with whitespace are appended to the
    preceding header, separated by a single space.

    :param client: the client to read the header lines from
    :return: dict mapping lower-cased header names to lower-cased values
    :raises InvalidResponse: if content-length occurs more than once or is
        not a non-negative integer
    """
    # An empty string is included so that a read returning nothing (assumed
    # to signal end of stream) ends the section instead of failing on line[0].
    end_of_headers = ("", "\r\n", "\n", " ")

    line = client.read_line()
    # first header after the status-line may not start with whitespace
    while line not in end_of_headers and line[0].isspace():
        line = client.read_line()

    headers = []
    while line not in end_of_headers:
        if line[0].isspace() and headers:
            # Folded header: glue the continuation onto the previous line.
            headers[-1] = headers[-1] + " " + line.strip()
        else:
            headers.append(line.rstrip("\r\n"))
        line = client.read_line()

    result = {}
    for entry in headers:
        pos = entry.find(":")
        # Skip lines without a name or without anything after the colon.
        if pos <= 0 or pos >= len(entry) - 1:
            continue

        # Lower-case the name before validating it; the validation below
        # compares against the lower-case spelling.
        header = entry[:pos].strip().lower()
        value = entry[pos + 1:].strip()
        ResponseHandler.check_next_header(result, header, value)
        result[header] = value.lower()

    return result

@staticmethod
def check_next_header(headers, next_header: str, next_value: str):
    """Validate a header before it is added to the headers parsed so far.

    :param headers: headers already parsed, keyed by lower-case name
    :param next_header: lower-case name of the header about to be added
    :param next_value: value of the header about to be added
    :raises InvalidResponse: if a second content-length is given or its
        value is not a sequence of ASCII digits
    """
    if next_header == "content-length":
        if "content-length" in headers:
            logging.error("Multiple content-length headers specified")
            raise InvalidResponse()

        # Zero is a legal length (empty body); only digits are accepted so
        # that a later int() conversion cannot fail.
        if not re.fullmatch(r"[0-9]+", next_value):
            logging.error("Invalid content-length value: %r", next_value)
            raise InvalidResponse()
@staticmethod
def parse_uri(uri: str):
parsed = urlparse(uri)
@@ -196,9 +114,9 @@ class DownloadHandler(ResponseHandler, ABC):
def _handle_sub_request(self, client, url):
(version, status, _) = self.get_status_line(client)
(version, status, _) = parser.get_status_line(client)
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
headers = self.get_headers(client)
headers = parser.get_headers(client)
logging.debug("Parsed headers: %r", headers)
if status != 200:
@@ -297,8 +215,8 @@ class HTMLDownloadHandler(DownloadHandler):
client.reset_request()
else:
client = HTTPClient(img_src)
client.connect((img_host, 80))
client.sendall(message)
client.conn.connect((img_host, 80))
client.conn.sendall(message)
filename = self._handle_sub_request(client, img_host + img_path)
if not same_host: