Parse html with regex, fix small issues

2021-03-27 23:41:28 +01:00
parent bbca6f603b
commit 4473d1bec9
7 changed files with 134 additions and 80 deletions
--- a/httplib/parser.py
+++ b/httplib/parser.py
@@ -1,9 +1,11 @@
 import logging
 import re
 import urllib
+from typing import Dict
 from urllib.parse import urlparse, urlsplit

 from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest, InvalidRequestLine
+from httplib.httpsocket import FORMAT


 def _is_valid_http_version(http_version: str):
@@ -164,7 +166,7 @@ def get_uri(url: str):
    parsed = urlsplit(url)

    result = f"http://{parsed.netloc}{parsed.path}"
-    if parsed.query != '':
+    if parsed.query != "":
        result = f"{result}?{parsed.query}"

    return result
@@ -175,3 +177,13 @@ def urljoin(base, url):
    Join a base url and a URL to form a absolute url.
    """
    return urllib.parse.urljoin(base, url)
+
+
+def get_charset(headers: Dict[str, str]):
+    """
+    Return the character set declared in the Content-Type header.
+
+    The charset value may be a bare token (`charset=utf-8`) or a quoted
+    string (`charset="utf-8"`); in both cases only the name itself is returned,
+    with its original casing.
+
+    :param headers: header mapping, looked up with the lowercase key "content-type"
+    :return: the declared charset, or FORMAT when the header is missing or
+        does not carry a non-empty charset value
+    """
+    content_type = headers.get("content-type")
+    if content_type:
+        # Skip an optional opening quote and require at least one character, so
+        # that `charset=""` or `charset=` falls through to the default instead
+        # of yielding an empty (unusable) codec name. The class also admits
+        # `_`, `.`, `:` and `+`, which occur in registered names such as
+        # "Shift_JIS".
+        match = re.search(r"charset\s*=\s*[\"']?([a-z0-9_.:+\-]+)", content_type, re.I)
+        if match:
+            return match.group(1)
+
+    return FORMAT