client: cleanup

2021-03-21 00:01:31 +01:00
parent fa8d08d63d
commit d8a5765fd8
4 changed files with 242 additions and 374 deletions


@@ -10,121 +10,6 @@ from client.Retriever import Retriever
from client.httpclient import HTTPClient, UnsupportedEncoding, FORMAT, InvalidResponse, InvalidStatusLine
def handle(client: HTTPClient, url: str):
logging.debug("Waiting for response")
try:
(version, status, _) = get_status_line(client)
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
headers = get_headers(client)
logging.debug("Parsed headers: %r", headers)
response_handler = construct(client, headers, status, url)
response_handler.handle()
except InvalidResponse as e:
logging.debug("Internal error: Response could not be parsed", exc_info=e)
return
except InvalidStatusLine as e:
logging.debug("Internal error: Invalid status-line in response", exc_info=e)
return
except UnsupportedEncoding as e:
logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
return
def get_status_line(client: HTTPClient):
line = client.read_line()
split = list(filter(None, line.split(" ")))
if len(split) < 3:
raise InvalidStatusLine(line)
# Check HTTP version
http_version = split.pop(0)
if len(http_version) < 8 or http_version[4] != "/":
raise InvalidStatusLine(line)
(name, version) = http_version[:4], http_version[5:]
if name != "HTTP" or not re.match(r"1\.[01]", version):
raise InvalidStatusLine(line)
status = split.pop(0)
if not re.match(r"\d{3}", status):
raise InvalidStatusLine(line)
status = int(status)
if status < 100 or status > 999:
raise InvalidStatusLine(line)
reason = split.pop(0)
return version, status, reason
def get_headers(client: HTTPClient):
headers = []
# the first header line after the status-line must not start with whitespace; skip any that do
while True:
line = client.read_line()
if line[0].isspace():
continue
else:
break
while True:
if line in ("\r\n", "\n", " "):
break
if line[0].isspace():
headers[-1] = headers[-1].rstrip("\r\n")
headers.append(line.lstrip())
line = client.read_line()
result = {}
header_str = "".join(headers)
for line in header_str.splitlines():
pos = line.find(":")
if pos <= 0 or pos >= len(line) - 1:
continue
(header, value) = map(str.strip, line.split(":", 1))
check_next_header(result, header, value)
result[header.lower()] = value.lower()
return result
def check_next_header(headers, next_header: str, next_value: str):
if next_header == "content-length":
if "content-length" in headers:
logging.error("Multiple content-length headers specified")
raise InvalidResponse()
if not next_value.isnumeric() or int(next_value) <= 0:
logging.error("Invalid content-length value: %r", next_value)
raise InvalidResponse()
def construct(client: HTTPClient, headers, status_code, url):
# only chunked transfer-encoding is supported
transfer_encoding = headers.get("transfer-encoding")
if transfer_encoding and transfer_encoding != "chunked":
raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
chunked = transfer_encoding
# content-encoding is not supported
content_encoding = headers.get("content-encoding")
if content_encoding:
raise UnsupportedEncoding("content-encoding", content_encoding)
retriever = Retriever.create(client, headers)
content_type = headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLDownloadHandler(retriever, client, headers, url)
return RawDownloadHandler(retriever, client, headers, url)
def parse_uri(uri: str):
parsed = urlparse(uri)
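For reference, a minimal standalone sketch (not part of this commit) of what the status-line parsing above does; parse_status_line and the sample input are illustrative only:

import re

def parse_status_line(line: str):
    # Split on spaces, dropping empty fields, as get_status_line does.
    parts = [p for p in line.split(" ") if p]
    name, _, version = parts[0].partition("/")
    if name != "HTTP" or not re.match(r"1\.[01]", version):
        raise ValueError("unsupported HTTP version: " + parts[0])
    status = int(parts[1])           # e.g. 200
    reason = parts[2]                # e.g. "OK"
    return version, status, reason

print(parse_status_line("HTTP/1.1 200 OK"))   # ('1.1', 200, 'OK')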
@@ -156,6 +41,98 @@ class ResponseHandler:
def handle(self):
pass
@staticmethod
def create(client: HTTPClient, headers, status_code, url):
# only chunked transfer-encoding is supported
transfer_encoding = headers.get("transfer-encoding")
if transfer_encoding and transfer_encoding != "chunked":
raise UnsupportedEncoding("transfer-encoding", transfer_encoding)
chunked = transfer_encoding
# content-encoding is not supported
content_encoding = headers.get("content-encoding")
if content_encoding:
raise UnsupportedEncoding("content-encoding", content_encoding)
retriever = Retriever.create(client, headers)
content_type = headers.get("content-type")
if content_type and "text/html" in content_type:
return HTMLDownloadHandler(retriever, client, headers, url)
return RawDownloadHandler(retriever, client, headers, url)
@staticmethod
def get_status_line(client: HTTPClient):
line = client.read_line()
split = list(filter(None, line.split(" ")))
if len(split) < 3:
raise InvalidStatusLine(line)
# Check HTTP version
http_version = split.pop(0)
if len(http_version) < 8 or http_version[4] != "/":
raise InvalidStatusLine(line)
(name, version) = http_version[:4], http_version[5:]
if name != "HTTP" or not re.match(r"1\.[01]", version):
raise InvalidStatusLine(line)
status = split.pop(0)
if not re.match(r"\d{3}", status):
raise InvalidStatusLine(line)
status = int(status)
if status < 100 or status > 999:
raise InvalidStatusLine(line)
reason = split.pop(0)
return version, status, reason
@staticmethod
def get_headers(client: HTTPClient):
headers = []
# the first header line after the status-line must not start with whitespace; skip any that do
while True:
line = client.read_line()
if line[0].isspace():
continue
else:
break
while True:
if line in ("\r\n", "\n", " "):
break
if line[0].isspace():
headers[-1] = headers[-1].rstrip("\r\n")
headers.append(line.lstrip())
line = client.read_line()
result = {}
header_str = "".join(headers)
for line in header_str.splitlines():
pos = line.find(":")
if pos <= 0 or pos >= len(line) - 1:
continue
(header, value) = map(str.strip, line.split(":", 1))
ResponseHandler.check_next_header(result, header, value)
result[header.lower()] = value.lower()
return result
@staticmethod
def check_next_header(headers, next_header: str, next_value: str):
if next_header == "content-length":
if "content-length" in headers:
logging.error("Multiple content-length headers specified")
raise InvalidResponse()
if not next_value.isnumeric() or int(next_value) <= 0:
logging.error("Invalid content-length value: %r", next_value)
raise InvalidResponse()
class DownloadHandler(ResponseHandler):
path: str
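Similarly, a rough standalone sketch (not part of the commit) of the header handling that get_headers implements: unfolding continuation lines that start with whitespace and lower-casing names and values. parse_headers and the sample lines are made up for illustration:

def parse_headers(raw_lines):
    # A continuation line (leading whitespace) belongs to the previous header.
    unfolded = []
    for line in raw_lines:
        if line in ("\r\n", "\n"):
            break
        if unfolded and line[0].isspace():
            unfolded[-1] = unfolded[-1].rstrip("\r\n") + line.lstrip()
        else:
            unfolded.append(line)
    result = {}
    for line in unfolded:
        name, sep, value = line.partition(":")
        if not sep or not value.strip():
            continue
        result[name.strip().lower()] = value.strip().lower()
    return result

print(parse_headers([
    "Content-Type: text/html; \r\n",
    " charset=utf-8\r\n",
    "Content-Length: 42\r\n",
    "\r\n",
]))
# {'content-type': 'text/html; charset=utf-8', 'content-length': '42'}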
@@ -220,9 +197,9 @@ class DownloadHandler(ResponseHandler):
def _handle_sub_request(self, client, url):
(version, status, _) = get_status_line(client)
(version, status, _) = self.get_status_line(client)
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
headers = get_headers(client)
headers = self.get_headers(client)
logging.debug("Parsed headers: %r", headers)
if status != 200:
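The parsing helpers are now @staticmethods on ResponseHandler, which is why the sub-request path calls them through self. A static method resolves the same way through an instance or the class; a tiny illustrative example (names made up):

class Example:
    @staticmethod
    def shout(text: str) -> str:
        # No self/cls: behaves like a plain function kept in the class namespace.
        return text.upper()

e = Example()
assert e.shout("ok") == Example.shout("ok") == "OK"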
@@ -275,30 +252,38 @@ class HTMLDownloadHandler(DownloadHandler):
with open(tmp_filename, "rb") as fp:
soup = BeautifulSoup(fp, 'html.parser')
base_url = self.url
base_element = soup.find("base")
if base_element:
base_url = base_element["href"]
for tag in soup.find_all("img"):
try:
tag["src"] = self.__download_image(tag["src"], host, path)
tag["src"] = self.__download_image(tag["src"], host, base_url)
except Exception as e:
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
logging.debug(e)
logging.error("Failed to download image: %s, skipping...", tag["src"])
with open(target_filename, 'w') as file:
file.write(str(soup))
def __download_image(self, img_src, host, path):
def __download_image(self, img_src, host, base_url):
parsed = urlparse(img_src)
logging.debug("Downloading image: %s", img_src)
same_host = True
if len(parsed.netloc) == 0 and parsed.path != "/":
# relative url, append base_url
img_src = os.path.join(os.path.dirname(base_url), parsed.path)
parsed = urlparse(img_src)
# Check if the image is located on the same server
if len(parsed.netloc) == 0 or parsed.netloc == host:
same_host = True
img_host = host
if parsed.path[0] != "/":
base = os.path.split(path)[0]
if base[-1] != '/':
base += "/"
img_path = base + parsed.path
else:
img_path = parsed.path
img_path = parsed.path
else:
same_host = False
(img_host, img_path) = parse_uri(img_src)
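For context, a standalone sketch (not part of the commit) of how the new base_url handling resolves a relative img src. The URLs are made up; urljoin is shown only as a standard-library alternative, not something the code above uses:

import os
from urllib.parse import urljoin, urlparse

base_url = "http://example.com/articles/index.html"   # hypothetical base document
img_src = "images/logo.png"                           # hypothetical relative src

# The approach used in __download_image: treat the URL path like a file path.
resolved = os.path.join(os.path.dirname(base_url), urlparse(img_src).path)
print(resolved)   # http://example.com/articles/images/logo.png (POSIX path handling)

# urllib's urljoin covers the same case plus edge cases such as "../" and
# absolute paths, and could serve as a drop-in alternative.
print(urljoin(base_url, img_src))   # http://example.com/articles/images/logo.png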