Client update

2021-03-19 03:29:35 +01:00
parent 1966a174bb
commit 797cdb0c0e
4 changed files with 425 additions and 37 deletions


@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
 import argparse
 import logging
-import sys
-import socket
 import re
+import socket
+import sys
 import time
-import os
-from client.ResponseHandler import ResponseHandler
+from urllib.parse import urlparse
+from client import ResponseHandler
+from client.httpclient import HTTPClient
 
 
 FORMAT = 'utf-8'
 BUFSIZE = 4096
@@ -125,22 +126,7 @@ def get_chunk(buffer: bytes):
     return buffer[:split_start], buffer[split_end:]
 
 
-def get_html_filename(headers):
-    if "CONTENT-LOCATION" not in headers:
-        return "index.html"
-    filename = headers["CONTENT-LOCATION"]
-    result = os.path.basename(filename).strip()
-    if len(result.strip()) == 0:
-        return 'index.html'
-    return result
-
-
 def response_parser(client: socket.socket):
-    client.settimeout(3.0)
     try:
         buffer = client.recv(BUFSIZE)
     except TimeoutError as err:
@@ -165,7 +151,7 @@ def response_parser(client: socket.socket):
     if payload_size == 0:
         return
 
-    filename = get_html_filename(headers)
+    filename = util.get_html_filename(headers)
     f = open(filename, "wb")
     f.write(buffer)
@@ -199,6 +185,20 @@ def http_parser(client: socket.socket):
         logging.debug("chunk: %r", chunk)
 
 
+def parse_uri(uri: str):
+    parsed = urlparse(uri)
+    # If there is no netloc, the url is invalid, so prepend `//` and try again
+    if parsed.netloc == "":
+        parsed = urlparse("//" + uri)
+
+    host = parsed.netloc
+    path = parsed.path
+    if len(path) == 0 or path[0] != '/':
+        path = "/" + path
+
+    return host, path
+
+
 def main():
     parser = argparse.ArgumentParser(description='HTTP Client')
     parser.add_argument("--verbose", "-v", action='count', default=0, help="Increase verbosity level of logging")
@@ -211,13 +211,19 @@ def main():
     logging.basicConfig(level=logging.ERROR - (10 * arguments.verbose))
     logging.debug("Arguments: %s", arguments)
 
-    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    client.connect((arguments.URI, arguments.port))
+    (host, path) = parse_uri(arguments.URI)
+    client = HTTPClient(host)
+    client.connect((host, arguments.port))
 
-    message = "GET /Protocols/HTTP/Performance/microscape/ HTTP/1.1\r\nHost: www.w3.org:80\r\n\r\n".encode(FORMAT)
+    message = "GET {path} HTTP/1.1\r\n".format(path=path)
+    message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
+    message += "Host: {host}\r\n\r\n".format(host=host)
+    message = message.encode(FORMAT)
+    logging.debug("Sending HTTP message: %r", message)
     client.sendall(message)
 
-    response_parser(client)
+    ResponseHandler.handle(client, arguments.URI)
+    # response_parser(client)
     # http_parser(client)
     # tmp = b''
     # keep = False

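For reference (not part of the commit): what the new parse_uri() helper above returns for a few representative URIs. The function body is copied from the hunk; the example hosts and paths are placeholders.

from urllib.parse import urlparse

def parse_uri(uri: str):
    parsed = urlparse(uri)
    # Without a scheme or a leading //, urlparse puts everything into `path`,
    # so prepend // and parse again to recover the host.
    if parsed.netloc == "":
        parsed = urlparse("//" + uri)

    host = parsed.netloc
    path = parsed.path
    if len(path) == 0 or path[0] != '/':
        path = "/" + path

    return host, path

print(parse_uri("www.example.org"))                   # ('www.example.org', '/')
print(parse_uri("www.example.org/a/page.html"))       # ('www.example.org', '/a/page.html')
print(parse_uri("http://www.example.org/page.html"))  # ('www.example.org', '/page.html')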

@@ -1,26 +1,110 @@
+import logging
 import os
-from socket import socket
 from typing import Dict
 from urllib.parse import urlparse
 
+from bs4 import BeautifulSoup
+
+from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
+
+
+def handle(client: HTTPClient, url: str):
+    logging.debug("Waiting for response")
+    try:
+        buffer = client.receive()
+    except TimeoutError:
+        print("[ABRT] Response timed out")
+        return
+
+    try:
+        (header_chunk, buffer) = client.get_crlf_chunk(buffer)
+        (status_line, headers) = client.parse_headers(header_chunk)
+        client.validate_status_line(status_line)
+        status_code = int(status_line.split(" ")[1])
+        response_handler = construct(client, headers, status_code, url)
+        response_handler.handle(buffer)
+    except InvalidResponse as e:
+        logging.debug("Internal error: Response could not be parsed", exc_info=e)
+        print("[ABRT] Invalid response")
+        return
+    except InvalidStatusLine as e:
+        logging.debug("Internal error: Invalid status-line in response", exc_info=e)
+        print("[ABRT] Invalid response")
+        return
+    except UnsupportedEncoding as e:
+        logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
+        print("[ABRT] Invalid response")
+        return
+
+
+def construct(client: HTTPClient, headers, status_code, url):
+    # only chunked transfer-encoding is supported
+    transfer_encoding = headers.get("transfer-encoding")
+    if transfer_encoding and transfer_encoding != "chunked":
+        raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
+    chunked = transfer_encoding
+
+    # content-encoding is not supported
+    content_encoding = headers.get("content-encoding")
+    if content_encoding:
+        raise UnsupportedEncoding("content-encoding", content_encoding)
+
+    if chunked:
+        return ChunkedResponseHandler(client, headers, status_code, url)
+    else:
+        content_type = headers.get("content-type")
+        if content_type and "text/html" in content_type:
+            return HTMLResponseHandler(client, headers, status_code, url)
+        return PlainResponseHandler(client, headers, status_code, url)
+
+
+def parse_uri(uri: str):
+    parsed = urlparse(uri)
+    # If there is no netloc, the url is invalid, so prepend `//` and try again
+    if parsed.netloc == "":
+        parsed = urlparse("//" + uri)
+
+    host = parsed.netloc
+    path = parsed.path
+    if len(path) == 0 or path[0] != '/':
+        path = "/" + path
+
+    return host, path
+
+
 class ResponseHandler:
-    client: socket
-    url: str
+    client: HTTPClient
     headers: Dict[str, str]
+    status_code: int
+    url: str
 
-    def __init__(self, url: str, client: socket):
-        self.headers = {}
-        self.url = url
+    def __init__(self, client: HTTPClient, headers: Dict[str, str], status_code: int, url: str):
         self.client = client
+        self.headers = headers
+        self.status_code = status_code
+        self.url = url
         pass
 
-    def get_html_filename(self):
+    def handle(self, buffer: bytes):
+        pass
+
+    def get_filename(self):
+        """Returns the filename to download the payload to.
+        """
         filename = "index.html"
         parsed = urlparse(self.url)
+        # If there is no netloc, the url is invalid, so prepend `//` and try again
         if parsed.netloc == "":
             parsed = urlparse("//" + self.url)
+
+        # If the path contains a `/` get only the last part and use it as filename
+        # If the path ends with a `/`, it's a directory so ignore it.
         if len(parsed.path) != 0:
             index = parsed.path.rfind("/")
             if index == -1:
@@ -29,14 +113,179 @@ class ResponseHandler:
                 filename = parsed.path[index:]
 
         result = os.path.basename(filename).strip()
-        return result
+        if any(letter.isalnum() for letter in result):
+            return result
+        return "index.html"
+
+    def _handle_download(self, client, url):
+        logging.debug("Waiting for response")
+        try:
+            buffer = client.receive()
+        except TimeoutError:
+            print("[ABRT] Response timed out")
+            return
+
+        try:
+            (header_chunk, buffer) = client.get_crlf_chunk(buffer)
+            (status_line, headers) = client.parse_headers(header_chunk)
+            client.validate_status_line(status_line)
+            status_code = int(status_line.split(" ")[1])
+            if status_code != 200:
+                raise InvalidResponse("Code not 200")
+            response_handler = construct(client, headers, status_code, url)
+            filename = response_handler.handle(buffer)
+            return filename
+        except InvalidResponse as e:
+            logging.debug("Internal error: Response could not be parsed", exc_info=e)
+            print("[ABRT] Invalid response")
+            return
+        except InvalidStatusLine as e:
+            logging.debug("Internal error: Invalid status-line in response", exc_info=e)
+            print("[ABRT] Invalid response")
+            return
+        except UnsupportedEncoding as e:
+            logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
+            print("[ABRT] Invalid response")
+            return
 
 
 class PlainResponseHandler(ResponseHandler):
-    def __init__(self, url: str, client: socket):
-        super().__init__(url, client)
+    def __init__(self, client: HTTPClient, headers, status_code, url):
+        super().__init__(client, headers, status_code, url)
+
+    def _get_payload_size(self):
+        content_length = self.__get_content_length()
+        if content_length == 0:
+            logging.debug("content-length is 0")
+            return None
+
+        payload_size = content_length
+        if not content_length:
+            payload_size = -1
+            logging.debug("No content-length specified")
+        else:
+            logging.debug("Expected content-length=%s", payload_size)
+        return payload_size
+
+    def handle(self, buffer: bytes):
+        payload_size = self._get_payload_size()
+        if payload_size is None:
+            return
+
+        logging.debug("Retrieving payload")
+        filename = self.get_filename()
+        file = open(filename, "wb")
+        self._retrieve(file, buffer, payload_size)
+        file.close()
+        return filename
+
+    def _retrieve(self, file, buffer: bytes, payload_size: int):
+        file.write(buffer)
+        cur_payload_size = len(buffer)
+        while cur_payload_size < payload_size:
+            buffer = self.client.receive()
+            logging.debug("Received payload length: %s", len(buffer))
+            if len(buffer) == 0:
+                logging.warning("Received payload length %s less than expected %s", cur_payload_size, payload_size)
+                break
+            cur_payload_size += len(buffer)
+            logging.debug("Processed payload: %r", cur_payload_size)
+            file.write(buffer)
+
+    def __get_content_length(self):
+        content_length = self.headers.get("content-length")
+        if not content_length:
+            return None
+        return int(content_length)
+
+
+class HTMLResponseHandler(PlainResponseHandler):
+    def __init__(self, client: HTTPClient, headers, status_code, url):
+        super().__init__(client, headers, status_code, url)
+
+    def handle(self, buffer: bytes):
+        payload_size = self._get_payload_size()
+        if payload_size is None:
+            return
+
+        logging.debug("Retrieving payload")
+        filename = self.get_filename()
+        tmp_filename = "." + filename + ".tmp"
+        file = open(tmp_filename, "wb")
+        self._retrieve(file, buffer, payload_size)
+        file.close()
+
+        self.__download_images(tmp_filename, filename)
+        os.remove(tmp_filename)
+        return filename
+
+    def __download_images(self, tmp_filename, target_filename):
+        (host, path) = parse_uri(self.url)
+        with open(tmp_filename, "r") as fp:
+            soup = BeautifulSoup(fp, "lxml")
+            for tag in soup.find_all("img"):
+                try:
+                    tag["src"] = self.__download_image(tag["src"], host, path)
+                except Exception as e:
+                    logging.error("Failed to download image, skipping...", exc_info=e)
+        with open(target_filename, 'w') as file:
+            file.write(str(soup))
+
+    def __download_image(self, img_src, host, path):
+        parsed = urlparse(img_src)
+        same_host = True
+        if len(parsed.netloc) == 0 or parsed.netloc == host:
+            img_host = host
+            if parsed.path[0] != "/":
+                base = os.path.split(path)[0]
+                if base[-1] != '/':
+                    base += "/"
+                img_path = base + parsed.path
+            else:
+                img_path = parsed.path
+        else:
+            same_host = False
+            (img_host, img_path) = parse_uri(img_src)
+
+        message = "GET {path} HTTP/1.1\r\n".format(path=img_path)
+        message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
+        message += "Host: {host}\r\n\r\n".format(host=img_host)
+        message = message.encode(FORMAT)
+
+        if same_host:
+            client = self.client
+        else:
+            client = HTTPClient(img_host)
+            client.connect((img_host, 80))
+        client.sendall(message)
+        filename = self._handle_download(client, img_host + img_path)
+        if not same_host:
+            client.close()
+        return filename
 
 
 class ChunkedResponseHandler(ResponseHandler):
-    def __init__(self, url: str, client: socket):
-        super().__init__(url, client)
+    def __init__(self, client: HTTPClient, headers, status_code, url):
+        super().__init__(client, headers, status_code, url)
+
+    def handle(self, buffer: bytes):
+        return None

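Not part of the commit, but a minimal sketch of the dispatch construct() performs, assuming the package layout implied by the imports in this diff (client.httpclient plus the ResponseHandler module in the hunk above); hosts and URLs are placeholders. Header keys are lower-case because HTTPClient.parse_headers() lower-cases them.

from client import ResponseHandler
from client.httpclient import HTTPClient

client = HTTPClient("www.example.org")  # placeholder host, never connected here

html = ResponseHandler.construct(client, {"content-type": "text/html; charset=utf-8"}, 200, "www.example.org/")
plain = ResponseHandler.construct(client, {"content-length": "42"}, 200, "www.example.org/data.bin")
chunked = ResponseHandler.construct(client, {"transfer-encoding": "chunked"}, 200, "www.example.org/")

print(type(html).__name__, type(plain).__name__, type(chunked).__name__)
# HTMLResponseHandler PlainResponseHandler ChunkedResponseHandler
client.close()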
client/httpclient.py (new file, 131 additions)

@@ -0,0 +1,131 @@
+import logging
+import re
+import socket
+from typing import Dict
+
+BUFSIZE = 4096
+TIMEOUT = 3
+FORMAT = "UTF-8"
+
+
+class HTTPClient(socket.socket):
+    host: str
+
+    def __init__(self, host: str):
+        super().__init__(socket.AF_INET, socket.SOCK_STREAM)
+        self.settimeout(TIMEOUT)
+        self.host = host
+
+    def _do_receive(self):
+        if self.fileno() == -1:
+            raise Exception("Connection closed")
+        result = self.recv(BUFSIZE)
+        return result
+
+    def receive(self):
+        """Receive data from the client up to BUFSIZE
+        """
+        count = 0
+        while True:
+            count += 1
+            try:
+                return self._do_receive()
+            except socket.timeout:
+                logging.debug("Socket receive timed out after %s seconds", TIMEOUT)
+                if count == 3:
+                    break
+                logging.debug("Retrying %s", count)
+        logging.debug("Timed out after waiting %s seconds for response", TIMEOUT * count)
+        raise TimeoutError("Request timed out")
+
+    def validate_status_line(self, status_line: str):
+        split = list(filter(None, status_line.split(" ")))
+        if len(split) < 3:
+            return False
+
+        # Check HTTP version
+        http_version = split.pop(0)
+        if len(http_version) < 8 or http_version[4] != "/":
+            raise InvalidStatusLine(status_line)
+        (name, version) = http_version[:4], http_version[5:]
+        if name != "HTTP" or not re.match(r"1\.[01]", version):
+            return False
+
+        if not re.match(r"\d{3}", split[0]):
+            return False
+        return True
+
+    def get_crlf_chunk(self, buffer: bytes):
+        """Finds the line break type (`CRLF` or `LF`) and splits the specified buffer
+        when encountering 2 consecutive linebreaks.
+        Returns a tuple with the first part and the remaining of the buffer.
+        :param buffer:
+        :return:
+        """
+        lf_pos = buffer.find(b"\n\n")
+        crlf_pos = buffer.find(b"\r\n\r\n")
+        if lf_pos != -1 and (crlf_pos == -1 or lf_pos < crlf_pos):
+            split_start = lf_pos
+            split_end = lf_pos + 2
+        else:
+            split_start = crlf_pos
+            split_end = crlf_pos + 4
+        return buffer[:split_start], buffer[split_end:]
+
+    def parse_headers(self, data: bytes):
+        headers = {}
+        # decode bytes, split into lines and filter
+        header_split = list(
+            filter(lambda l: l != "" and not l[0].isspace(), map(str.strip, data.decode("utf-8").split("\n"))))
+        if len(header_split) == 0:
+            raise InvalidResponse(data)
+
+        start_line = header_split.pop(0)
+        logging.debug("start-line: %r", start_line)
+        for line in header_split:
+            pos = line.find(":")
+            if pos <= 0 or pos >= len(line) - 1:
+                continue
+            (header, value) = map(str.strip, line.split(":", 1))
+            headers[header.lower()] = value.lower()
+        logging.debug("Parsed headers: %r", headers)
+        return start_line, headers
+
+
+class HTTPException(Exception):
+    """ Base class for HTTP exceptions """
+
+
+class InvalidResponse(HTTPException):
+    """ Response message cannot be parsed """
+
+    def __init__(self, message):
+        self.message = message
+
+
+class InvalidStatusLine(HTTPException):
+    """ Response status line is invalid """
+
+    def __init__(self, line):
+        self.line = line
+
+
+class UnsupportedEncoding(HTTPException):
+    """ Response encoding is not supported """
+
+    def __init__(self, enc_type, encoding):
+        self.enc_type = enc_type
+        self.encoding = encoding

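A quick usage sketch of the parsing helpers in the new HTTPClient class (not part of the commit); the response bytes are a made-up example and the host is a placeholder, so no network I/O happens.

from client.httpclient import HTTPClient

raw = b"HTTP/1.1 200 OK\r\nContent-Type: text/html\r\nContent-Length: 5\r\n\r\nhello"

client = HTTPClient("www.example.org")             # placeholder host, never connected
head, body = client.get_crlf_chunk(raw)            # split the header block from the payload
status_line, headers = client.parse_headers(head)

print(status_line)                                 # HTTP/1.1 200 OK
print(client.validate_status_line(status_line))    # True
print(headers["content-type"], headers["content-length"])  # text/html 5
print(body)                                        # b'hello'
client.close()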

@@ -0,0 +1,2 @@
+beautifulsoup4~=4.9.3
+lxml==4.6.2
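beautifulsoup4 and lxml back the new HTMLResponseHandler image pass. As an illustration only (not part of the commit), the same BeautifulSoup/lxml rewrite on an inline HTML string, with a trivial src rewrite standing in for the handler's download-and-rename step:

from bs4 import BeautifulSoup

html = '<html><body><img src="/img/logo.png"><img src="http://cdn.example.org/pic.png"></body></html>'
soup = BeautifulSoup(html, "lxml")
for tag in soup.find_all("img"):
    # The real handler downloads each image first, then points src at the local filename.
    tag["src"] = tag["src"].rsplit("/", 1)[-1]
print(soup)
# <html><body><img src="logo.png"/><img src="pic.png"/></body></html>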