Parse html with regex, fix small issues

This commit is contained in:
2021-03-27 23:41:28 +01:00
parent bbca6f603b
commit 4473d1bec9
7 changed files with 134 additions and 80 deletions

View File

@@ -1,9 +1,11 @@
import logging
import re
import urllib
from typing import Dict
from urllib.parse import urlparse, urlsplit
from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest, InvalidRequestLine
from httplib.httpsocket import FORMAT
def _is_valid_http_version(http_version: str):
@@ -164,7 +166,7 @@ def get_uri(url: str):
parsed = urlsplit(url)
result = f"http://{parsed.netloc}{parsed.path}"
if parsed.query != '':
if parsed.query != "":
result = f"{result}?{parsed.query}"
return result
@@ -175,3 +177,13 @@ def urljoin(base, url):
Join a base URL and a URL to form an absolute URL.
"""
return urllib.parse.urljoin(base, url)
def get_charset(headers: Dict[str, str]):
    """
    Return the charset declared in the "content-type" header.

    The charset parameter value may be a bare token (charset=utf-8) or a
    quoted string (charset="utf-8"); surrounding quotes are not part of the
    returned name. The parameter name is matched case-insensitively and the
    value is returned exactly as written in the header.

    :param headers: header mapping; only the "content-type" key is consulted
    :return: the declared charset name, or FORMAT when the header is absent,
        has no charset parameter, or the parameter value is empty
    """
    content_type = headers.get("content-type")
    if content_type:
        # Require at least one character so that an empty value such as
        # "charset=" falls through to the default instead of returning "",
        # which is not a usable codec name.
        match = re.search(
            r"charset\s*=\s*[\"']?([a-z0-9][a-z0-9_.:+\-]*)",
            content_type,
            re.I,
        )
        if match:
            return match.group(1)
    return FORMAT