update
This commit is contained in:
@@ -1,121 +0,0 @@
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict
|
||||
|
||||
from client.httpclient import HTTPClient, BUFSIZE, IncompleteResponse, InvalidResponse, UnsupportedEncoding
|
||||
|
||||
|
||||
class Retriever(ABC):
    """Base class for strategies that read a response body from a client.

    Concrete subclasses implement ``retrieve``; ``create`` picks the
    subclass that matches the response headers.
    """

    client: HTTPClient

    def __init__(self, client: HTTPClient):
        """Remember the client the body will be read from."""
        self.client = client

    @abstractmethod
    def retrieve(self):
        """Produce the response body; implemented by subclasses."""

    @staticmethod
    def create(client: HTTPClient, headers: Dict[str, str]):
        """Select the retriever matching the given response headers.

        :param client: client the body will be read from
        :param headers: response headers, looked up by lowercase name
        :return: a ``ChunkedRetriever`` when transfer-encoding is
            ``chunked``, a ``ContentLengthRetriever`` when a content-length
            is present, otherwise a ``RawRetriever``
        :raises UnsupportedEncoding: for any transfer-encoding other than
            ``chunked`` and for any content-encoding
        """
        # "chunked" is the only transfer-encoding that is understood.
        encoding = headers.get("transfer-encoding")
        if encoding and encoding != "chunked":
            raise UnsupportedEncoding("transfer-encoding", encoding)

        # No content-encoding is understood at all.
        compression = headers.get("content-encoding")
        if compression:
            raise UnsupportedEncoding("content-encoding", compression)

        if encoding:
            return ChunkedRetriever(client)

        declared_length = headers.get("content-length")
        if declared_length:
            return ContentLengthRetriever(client, int(declared_length))

        logging.warning("Transfer-encoding and content-length not specified, trying without")
        return RawRetriever(client)
|
||||
|
||||
|
||||
class ContentLengthRetriever(Retriever):
    """Retriever for a body whose size is given by a content-length."""

    length: int

    def __init__(self, client: HTTPClient, length: int):
        """
        :param client: client the body will be read from
        :param length: expected body size in bytes
        """
        super().__init__(client)
        self.length = length

    def retrieve(self):
        """Yield the body in pieces of at most ``BUFSIZE`` bytes.

        Reading stops once ``length`` bytes were received, or early (with a
        warning) when the client returns an empty buffer.

        :raises IncompleteResponse: when the read times out or the
            connection fails before the body is complete; the client is
            closed first
        """
        received = 0
        while received < self.length:
            # Ask for one buffer at most and never read past the end of the
            # body, so large payloads are streamed instead of being read in
            # a single call.
            read_size = min(BUFSIZE, self.length - received)

            try:
                buffer = self.client.read(read_size)
            except TimeoutError as err:
                logging.error("Timed out before receiving complete payload")
                self.client.close()
                raise IncompleteResponse("Timed out before receiving complete payload") from err
            except ConnectionError as err:
                logging.error("Connection closed before receiving complete payload")
                self.client.close()
                raise IncompleteResponse("Connection closed before receiving complete payload") from err

            logging.debug("Received payload length: %s", len(buffer))

            if not buffer:
                # Nothing more to read although the body is not complete yet.
                logging.warning("Received payload length %s less than expected %s", received, self.length)
                break

            received += len(buffer)
            logging.debug("Processed payload: %r", received)
            yield buffer

        return b""
|
||||
|
||||
|
||||
class RawRetriever(Retriever):
    """Retriever used when the response announces no body framing."""

    def retrieve(self):
        """Yield whatever the client returns until the stream ends.

        The stream is considered finished when a read times out, the
        connection fails, or a read returns no data.
        """
        while True:
            try:
                buffer = self.client.read()
            except (TimeoutError, ConnectionError):
                # A tuple is required here: ``TimeoutError or ConnectionError``
                # evaluates to ``TimeoutError`` alone.
                return b""

            if not buffer:
                # An empty read is treated as end of stream, as
                # ContentLengthRetriever does, instead of yielding empty
                # buffers forever.
                return b""

            yield buffer
|
||||
|
||||
|
||||
class ChunkedRetriever(Retriever):
    """Retriever for a body sent with chunked transfer-encoding."""

    def retrieve(self):
        """Yield the payload of each chunk until the zero-sized chunk.

        After every chunk the line terminator following the payload is
        consumed; after the final chunk the client's request state is reset.
        """
        size = self.__get_chunk_size()
        logging.debug("chunk-size: %s", size)

        while size != 0:
            data = self.client.read(size)
            logging.debug("chunk: %r", data)
            yield data

            # Drop the CRLF that terminates the chunk payload.
            self.client.read_line()

            size = self.__get_chunk_size()
            logging.debug("chunk-size: %s", size)

        self.client.reset_request()
        return b""

    def __get_chunk_size(self):
        """Read a chunk-size line and return the size as an integer.

        Anything from the first ``;`` onwards is ignored.

        :raises InvalidResponse: when the size is not a hexadecimal number
        """
        size_field = self.client.read_line().partition(";")[0]

        try:
            return int(size_field, 16)
        except ValueError:
            raise InvalidResponse()
|
@@ -2,9 +2,10 @@ import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from client.ResponseHandler import ResponseHandler
|
||||
from client.httpclient import FORMAT, HTTPClient, InvalidResponse, InvalidStatusLine, UnsupportedEncoding
|
||||
|
||||
from client.response_handler import ResponseHandler
|
||||
from client.httpclient import FORMAT, HTTPClient
|
||||
from httplib import parser
|
||||
from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding
|
||||
|
||||
class AbstractCommand(ABC):
|
||||
|
||||
@@ -34,7 +35,7 @@ class AbstractCommand(ABC):
|
||||
(host, path) = self.parse_uri()
|
||||
|
||||
client = HTTPClient(host)
|
||||
client.connect((host, int(self.port)))
|
||||
client.conn.connect((host, int(self.port)))
|
||||
|
||||
message = f"{self.command} {path} HTTP/1.1\r\n"
|
||||
message += f"Host: {host}\r\n"
|
||||
@@ -44,7 +45,7 @@ class AbstractCommand(ABC):
|
||||
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
|
||||
|
||||
logging.debug("Sending HTTP message: %r", encoded_msg)
|
||||
client.sendall(encoded_msg)
|
||||
client.conn.sendall(encoded_msg)
|
||||
|
||||
logging.info("HTTP request sent, awaiting response...")
|
||||
|
||||
@@ -118,9 +119,9 @@ class GetCommand(AbstractCommand):
|
||||
return "GET"
|
||||
|
||||
def _await_response(self, client):
|
||||
(version, status, msg) = ResponseHandler.get_status_line(client)
|
||||
(version, status, msg) = parser.get_status_line(client)
|
||||
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
|
||||
headers = ResponseHandler.get_headers(client)
|
||||
headers = parser.get_headers(client)
|
||||
logging.debug("Parsed headers: %r", headers)
|
||||
|
||||
handler = ResponseHandler.create(client, headers, status, self.url)
|
||||
|
@@ -1,6 +1,6 @@
|
||||
import logging
|
||||
import socket
|
||||
from io import BufferedReader
|
||||
|
||||
from httplib.httpsocket import HTTPSocket
|
||||
|
||||
BUFSIZE = 4096
|
||||
TIMEOUT = 3
|
||||
@@ -8,98 +8,8 @@ FORMAT = "UTF-8"
|
||||
MAXLINE = 4096
|
||||
|
||||
|
||||
class HTTPClient(socket.socket):
|
||||
class HTTPClient(HTTPSocket):
|
||||
host: str
|
||||
file: BufferedReader
|
||||
|
||||
def __init__(self, host: str):
|
||||
|
||||
super().__init__(socket.AF_INET, socket.SOCK_STREAM)
|
||||
self.settimeout(TIMEOUT)
|
||||
self.host = host
|
||||
self.setblocking(True)
|
||||
self.settimeout(3.0)
|
||||
self.file = self.makefile("rb")
|
||||
|
||||
def close(self):
|
||||
self.file.close()
|
||||
super().close()
|
||||
|
||||
def reset_request(self):
|
||||
self.file.close()
|
||||
self.file = self.makefile("rb")
|
||||
|
||||
def __do_receive(self):
|
||||
if self.fileno() == -1:
|
||||
raise Exception("Connection closed")
|
||||
|
||||
result = self.recv(BUFSIZE)
|
||||
return result
|
||||
|
||||
def receive(self):
|
||||
"""Receive data from the client up to BUFSIZE
|
||||
"""
|
||||
count = 0
|
||||
while True:
|
||||
count += 1
|
||||
try:
|
||||
return self.__do_receive()
|
||||
except socket.timeout:
|
||||
logging.debug("Socket receive timed out after %s seconds", TIMEOUT)
|
||||
if count == 3:
|
||||
break
|
||||
logging.debug("Retrying %s", count)
|
||||
|
||||
logging.debug("Timed out after waiting %s seconds for response", TIMEOUT * count)
|
||||
raise TimeoutError("Request timed out")
|
||||
|
||||
def read(self, size=BUFSIZE, blocking=True) -> bytes:
|
||||
if blocking:
|
||||
return self.file.read(size)
|
||||
|
||||
return self.file.read1(size)
|
||||
|
||||
def read_line(self):
|
||||
return str(self.read_bytes_line(), FORMAT)
|
||||
|
||||
def read_bytes_line(self):
|
||||
"""
|
||||
|
||||
:rtype: bytes
|
||||
"""
|
||||
line = self.file.readline(MAXLINE + 1)
|
||||
if len(line) > MAXLINE:
|
||||
raise InvalidResponse("Line too long")
|
||||
|
||||
return line
|
||||
|
||||
|
||||
class HTTPException(Exception):
    """ Base class for HTTP exceptions """


class InvalidResponse(HTTPException):
    """ Response message cannot be parsed """

    def __init__(self, message=None):
        # Only forward arguments that were actually given, so ``args`` and
        # ``str()`` stay the same as for a plain Exception.
        super().__init__(*(arg for arg in (message,) if arg is not None))
        self.message = message


class InvalidStatusLine(HTTPException):
    """ Response status line is invalid """

    def __init__(self, line=None):
        super().__init__(*(arg for arg in (line,) if arg is not None))
        self.line = line


class UnsupportedEncoding(HTTPException):
    """ Response encoding is not supported """

    def __init__(self, enc_type=None, encoding=None):
        super().__init__(*(arg for arg in (enc_type, encoding) if arg is not None))
        # Kind of encoding header, e.g. "transfer-encoding".
        self.enc_type = enc_type
        # The offending value of that header.
        self.encoding = encoding


class IncompleteResponse(HTTPException):
    """ Response ended before it was completely received """

    def __init__(self, cause=None):
        super().__init__(*(arg for arg in (cause,) if arg is not None))
        self.cause = cause
|
||||
super().__init__(socket.socket(socket.AF_INET, socket.SOCK_STREAM), host)
|
||||
|
@@ -1,14 +1,15 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from client.Retriever import Retriever
|
||||
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
|
||||
from client.httpclient import HTTPClient, FORMAT
|
||||
from httplib.retriever import Retriever
|
||||
from httplib import parser
|
||||
from httplib.exceptions import InvalidResponse
|
||||
|
||||
|
||||
class ResponseHandler(ABC):
|
||||
@@ -31,17 +32,6 @@ class ResponseHandler(ABC):
|
||||
|
||||
@staticmethod
|
||||
def create(client: HTTPClient, headers, status_code, url):
|
||||
# only chunked transfer-encoding is supported
|
||||
transfer_encoding = headers.get("transfer-encoding")
|
||||
if transfer_encoding and transfer_encoding != "chunked":
|
||||
raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
|
||||
chunked = transfer_encoding
|
||||
|
||||
# content-encoding is not supported
|
||||
content_encoding = headers.get("content-encoding")
|
||||
if content_encoding:
|
||||
raise UnsupportedEncoding("content-encoding", content_encoding)
|
||||
|
||||
retriever = Retriever.create(client, headers)
|
||||
|
||||
content_type = headers.get("content-type")
|
||||
@@ -49,78 +39,6 @@ class ResponseHandler(ABC):
|
||||
return HTMLDownloadHandler(retriever, client, headers, url)
|
||||
return RawDownloadHandler(retriever, client, headers, url)
|
||||
|
||||
@staticmethod
def get_status_line(client: HTTPClient):
    """Read and validate the status line of a response.

    :param client: client to read the line from
    :return: tuple ``(version, status, reason)`` with the HTTP version
        without the ``HTTP/`` prefix, the status code as ``int`` and the
        reason phrase (possibly empty)
    :raises InvalidStatusLine: when the line is not a well-formed
        HTTP/1.0 or HTTP/1.1 status line
    """
    line = client.read_line()

    # Strip the line terminator first so it cannot stick to the status code
    # or the reason phrase, then collapse repeated spaces.
    parts = [part for part in line.rstrip("\r\n").split(" ") if part]
    if len(parts) < 2:
        raise InvalidStatusLine(line)

    http_version, status_text, *reason_words = parts

    # Only HTTP/1.0 and HTTP/1.1 are accepted; the whole token must match.
    name, _, version = http_version.partition("/")
    if name != "HTTP" or not re.fullmatch(r"1\.[01]", version):
        raise InvalidStatusLine(line)

    # Exactly three digits in the range 100-999. Matching the full token
    # keeps int() from failing on input such as "200abc".
    if not re.fullmatch(r"[1-9][0-9]{2}", status_text):
        raise InvalidStatusLine(line)
    status = int(status_text)

    # The reason phrase may consist of several words ("Not Found").
    reason = " ".join(reason_words)
    return version, status, reason
|
||||
|
||||
@staticmethod
def get_headers(client: HTTPClient):
    """Read the header section of a response into a dictionary.

    Lines are read until a blank line or the end of the stream. Lines
    starting with whitespace are treated as continuations of the previous
    header. Lines without a name or without a value are ignored.

    :param client: client to read the lines from
    :return: dict mapping lowercased header names to lowercased values
    :raises InvalidResponse: propagated from ``check_next_header``
    """
    # An empty string means the stream ended; it terminates the section
    # just like a blank line does.
    end_of_headers = ("", "\r\n", "\n")

    # The first header after the status-line may not start with whitespace,
    # so such lines are skipped. The blank terminator line must not be
    # skipped here, although "\r" counts as whitespace.
    line = client.read_line()
    while line not in end_of_headers and line[0].isspace():
        line = client.read_line()

    headers = []
    while line not in end_of_headers and line != " ":
        if line[0].isspace() and headers:
            # Continuation line: join it to the previous header, separated
            # by a single space so the two parts do not run together.
            headers[-1] = headers[-1].rstrip("\r\n") + " "

        headers.append(line.lstrip())
        line = client.read_line()

    result = {}
    for line in "".join(headers).splitlines():
        pos = line.find(":")

        # Skip lines without a name before, or a value after, the colon.
        if pos <= 0 or pos >= len(line) - 1:
            continue

        (header, value) = map(str.strip, line.split(":", 1))
        ResponseHandler.check_next_header(result, header, value)
        result[header.lower()] = value.lower()

    return result
|
||||
|
||||
@staticmethod
def check_next_header(headers, next_header: str, next_value: str):
    """Validate a header before it is added to the parsed headers.

    Only content-length is checked: it may occur once and its value must
    be a non-negative decimal number.

    :param headers: headers parsed so far, keyed by lowercase name
    :param next_header: name of the header about to be added, any case
    :param next_value: value of the header about to be added
    :raises InvalidResponse: on a repeated or malformed content-length
    """
    # Header names are case-insensitive and the caller may pass the name as
    # it was received, so normalise it before comparing.
    if next_header.lower() != "content-length":
        return

    if "content-length" in headers:
        logging.error("Multiple content-length headers specified")
        raise InvalidResponse()

    # Zero is a valid length (empty body); only non-numeric values are
    # rejected.
    if not re.fullmatch(r"[0-9]+", next_value):
        logging.error("Invalid content-length value: %r", next_value)
        raise InvalidResponse()
|
||||
|
||||
@staticmethod
|
||||
def parse_uri(uri: str):
|
||||
parsed = urlparse(uri)
|
||||
@@ -196,9 +114,9 @@ class DownloadHandler(ResponseHandler, ABC):
|
||||
|
||||
def _handle_sub_request(self, client, url):
|
||||
|
||||
(version, status, _) = self.get_status_line(client)
|
||||
(version, status, _) = parser.get_status_line(client)
|
||||
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
|
||||
headers = self.get_headers(client)
|
||||
headers = parser.get_headers(client)
|
||||
logging.debug("Parsed headers: %r", headers)
|
||||
|
||||
if status != 200:
|
||||
@@ -297,8 +215,8 @@ class HTMLDownloadHandler(DownloadHandler):
|
||||
client.reset_request()
|
||||
else:
|
||||
client = HTTPClient(img_src)
|
||||
client.connect((img_host, 80))
|
||||
client.sendall(message)
|
||||
client.conn.connect((img_host, 80))
|
||||
client.conn.sendall(message)
|
||||
filename = self._handle_sub_request(client, img_host + img_path)
|
||||
|
||||
if not same_host:
|
Reference in New Issue
Block a user