Parse html with regex, fix small issues
This commit is contained in:
@@ -1,9 +1,11 @@
|
||||
import logging
|
||||
import re
|
||||
import urllib
|
||||
from typing import Dict
|
||||
from urllib.parse import urlparse, urlsplit
|
||||
|
||||
from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest, InvalidRequestLine
|
||||
from httplib.httpsocket import FORMAT
|
||||
|
||||
|
||||
def _is_valid_http_version(http_version: str):
|
||||
@@ -164,7 +166,7 @@ def get_uri(url: str):
|
||||
parsed = urlsplit(url)
|
||||
|
||||
result = f"http://{parsed.netloc}{parsed.path}"
|
||||
if parsed.query != '':
|
||||
if parsed.query != "":
|
||||
result = f"{result}?{parsed.query}"
|
||||
|
||||
return result
|
||||
@@ -175,3 +177,13 @@ def urljoin(base, url):
|
||||
Join a base URL and a URL to form an absolute URL.
|
||||
"""
|
||||
return urllib.parse.urljoin(base, url)
|
||||
|
||||
|
||||
def get_charset(headers: Dict[str, str]):
    """
    Return the charset declared in the "content-type" header.

    The charset parameter is matched case-insensitively and may optionally be
    wrapped in single or double quotes (e.g. `charset="utf-8"`).

    :param headers: mapping of header names to values; the lookup uses the
        lowercase key "content-type".
    :return: the declared charset exactly as written in the header, or FORMAT
        when the header is absent or carries no non-empty charset value.
    """
    content_type = headers.get("content-type", "")

    # `+` (not `*`) so an empty or unparsable value falls through to the
    # default instead of returning "". The optional quote lets quoted values
    # match, and `_ . : +` cover registered names such as "Shift_JIS".
    match = re.search(
        r"charset\s*=\s*[\"']?([a-z0-9_.:+\-]+)", content_type, re.I
    )
    if match:
        return match.group(1)

    return FORMAT
|
||||
|
Reference in New Issue
Block a user