Update
This commit is contained in:
@@ -3,7 +3,7 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from client.command import AbstractCommand
|
from client import command as cmd
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
@@ -18,7 +18,7 @@ def main():
|
|||||||
logging.basicConfig(level=logging.ERROR - (10 * arguments.verbose))
|
logging.basicConfig(level=logging.ERROR - (10 * arguments.verbose))
|
||||||
logging.debug("Arguments: %s", arguments)
|
logging.debug("Arguments: %s", arguments)
|
||||||
|
|
||||||
command = AbstractCommand.create(arguments.command, arguments.URI, arguments.port)
|
command = cmd.create(arguments.command, arguments.URI, arguments.port)
|
||||||
command.execute()
|
command.execute()
|
||||||
|
|
||||||
|
|
||||||
|
@@ -1,25 +1,18 @@
|
|||||||
import logging
|
import logging
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Dict, Tuple
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from client.response_handler import ResponseHandler
|
|
||||||
from client.httpclient import FORMAT, HTTPClient
|
from client.httpclient import FORMAT, HTTPClient
|
||||||
from httplib import parser
|
from httplib import parser
|
||||||
from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding
|
from httplib.exceptions import InvalidResponse, InvalidStatusLine, UnsupportedEncoding
|
||||||
|
from httplib.message import Message
|
||||||
|
from httplib.retriever import PreambleRetriever
|
||||||
|
|
||||||
class AbstractCommand(ABC):
|
sockets: Dict[str, HTTPClient] = {}
|
||||||
|
|
||||||
def __init__(self, url: str, port: str):
|
|
||||||
self.url = url
|
|
||||||
self.port = port
|
|
||||||
|
|
||||||
@property
|
def create(command: str, url: str, port):
|
||||||
@abstractmethod
|
|
||||||
def command(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def create(command: str, url: str, port: str):
|
|
||||||
if command == "GET":
|
if command == "GET":
|
||||||
return GetCommand(url, port)
|
return GetCommand(url, port)
|
||||||
elif command == "HEAD":
|
elif command == "HEAD":
|
||||||
@@ -32,6 +25,22 @@ class AbstractCommand(ABC):
|
|||||||
raise ValueError()
|
raise ValueError()
|
||||||
|
|
||||||
|
|
||||||
|
class AbstractCommand(ABC):
|
||||||
|
uri: str
|
||||||
|
host: str
|
||||||
|
path: str
|
||||||
|
port: Tuple[str, int]
|
||||||
|
|
||||||
|
def __init__(self, uri: str, port):
|
||||||
|
self.uri = uri
|
||||||
|
self.host, _, self.path = parser.parse_uri(uri)
|
||||||
|
self.port = port
|
||||||
|
|
||||||
|
@property
|
||||||
|
@abstractmethod
|
||||||
|
def command(self):
|
||||||
|
pass
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def build_message(command, host, path):
|
def build_message(command, host, path):
|
||||||
message = f"{command} {path} HTTP/1.1\r\n"
|
message = f"{command} {path} HTTP/1.1\r\n"
|
||||||
@@ -40,26 +49,34 @@ class AbstractCommand(ABC):
|
|||||||
|
|
||||||
return message.encode(FORMAT)
|
return message.encode(FORMAT)
|
||||||
|
|
||||||
def execute(self):
|
def execute(self, sub_request=False):
|
||||||
(host, path) = self.parse_uri()
|
(host, path) = self.parse_uri()
|
||||||
|
|
||||||
|
client = sockets.get(host)
|
||||||
|
|
||||||
|
if client and client.is_closed():
|
||||||
|
sockets.pop(self.host)
|
||||||
|
client = None
|
||||||
|
|
||||||
|
if not client:
|
||||||
client = HTTPClient(host)
|
client = HTTPClient(host)
|
||||||
client.conn.connect((host, int(self.port)))
|
client.conn.connect((host, self.port))
|
||||||
|
sockets[host] = client
|
||||||
|
|
||||||
message = f"{self.command} {path} HTTP/1.1\r\n"
|
message = f"{self.command} {path} HTTP/1.1\r\n"
|
||||||
message += f"Host: {host}\r\n"
|
message += f"Host: {host}:{self.port}\r\n"
|
||||||
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
||||||
encoded_msg = self._build_message(message)
|
encoded_msg = self._build_message(message)
|
||||||
|
|
||||||
logging.info("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
|
logging.debug("---request begin---\r\n%s---request end---", encoded_msg.decode(FORMAT))
|
||||||
|
|
||||||
logging.debug("Sending HTTP message: %r", encoded_msg)
|
|
||||||
client.conn.sendall(encoded_msg)
|
client.conn.sendall(encoded_msg)
|
||||||
|
|
||||||
logging.info("HTTP request sent, awaiting response...")
|
logging.info("HTTP request sent, awaiting response...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._await_response(client)
|
retriever = PreambleRetriever(client)
|
||||||
|
self._await_response(client, retriever)
|
||||||
except InvalidResponse as e:
|
except InvalidResponse as e:
|
||||||
logging.debug("Internal error: Response could not be parsed", exc_info=e)
|
logging.debug("Internal error: Response could not be parsed", exc_info=e)
|
||||||
return
|
return
|
||||||
@@ -69,9 +86,10 @@ class AbstractCommand(ABC):
|
|||||||
except UnsupportedEncoding as e:
|
except UnsupportedEncoding as e:
|
||||||
logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
|
logging.debug("Internal error: Unsupported encoding in response", exc_info=e)
|
||||||
finally:
|
finally:
|
||||||
|
if not sub_request:
|
||||||
client.close()
|
client.close()
|
||||||
|
|
||||||
def _await_response(self, client):
|
def _await_response(self, client, retriever):
|
||||||
while True:
|
while True:
|
||||||
line = client.read_line()
|
line = client.read_line()
|
||||||
print(line, end="")
|
print(line, end="")
|
||||||
@@ -82,11 +100,11 @@ class AbstractCommand(ABC):
|
|||||||
return (message + "\r\n").encode(FORMAT)
|
return (message + "\r\n").encode(FORMAT)
|
||||||
|
|
||||||
def parse_uri(self):
|
def parse_uri(self):
|
||||||
parsed = urlparse(self.url)
|
parsed = urlparse(self.uri)
|
||||||
|
|
||||||
# If there is no netloc, the url is invalid, so prepend `//` and try again
|
# If there is no netloc, the url is invalid, so prepend `//` and try again
|
||||||
if parsed.netloc == "":
|
if parsed.netloc == "":
|
||||||
parsed = urlparse("//" + self.url)
|
parsed = urlparse("//" + self.uri)
|
||||||
|
|
||||||
host = parsed.netloc
|
host = parsed.netloc
|
||||||
path = parsed.path
|
path = parsed.path
|
||||||
@@ -105,6 +123,7 @@ class AbstractWithBodyCommand(AbstractCommand, ABC):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def build_message(command, host, path):
|
def build_message(command, host, path):
|
||||||
message = AbstractCommand.build_message()
|
message = AbstractCommand.build_message()
|
||||||
|
|
||||||
def _build_message(self, message: str) -> bytes:
|
def _build_message(self, message: str) -> bytes:
|
||||||
body = input(f"Enter {self.command} data: ").encode(FORMAT)
|
body = input(f"Enter {self.command} data: ").encode(FORMAT)
|
||||||
print()
|
print()
|
||||||
@@ -126,18 +145,31 @@ class HeadCommand(AbstractCommand):
|
|||||||
|
|
||||||
|
|
||||||
class GetCommand(AbstractCommand):
|
class GetCommand(AbstractCommand):
|
||||||
|
|
||||||
|
def __init__(self, uri: str, port, dir=None):
|
||||||
|
super().__init__(uri, port)
|
||||||
|
self.dir = dir
|
||||||
|
self.filename = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def command(self):
|
def command(self):
|
||||||
return "GET"
|
return "GET"
|
||||||
|
|
||||||
def _await_response(self, client):
|
def _get_preamble(self, retriever):
|
||||||
(version, status, msg) = parser.get_status_line(client)
|
lines = retriever.retrieve()
|
||||||
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
|
(version, status, msg) = parser.parse_status_line(next(lines))
|
||||||
headers = parser.get_headers(client)
|
headers = parser.parse_headers(lines)
|
||||||
logging.debug("Parsed headers: %r", headers)
|
|
||||||
|
|
||||||
handler = ResponseHandler.create(client, headers, status, self.url)
|
logging.debug("---response begin---\r\n%s--- response end---", "".join(retriever.buffer))
|
||||||
handler.handle()
|
|
||||||
|
return Message(version, status, msg, headers)
|
||||||
|
|
||||||
|
def _await_response(self, client, retriever) -> str:
|
||||||
|
msg = self._get_preamble(retriever)
|
||||||
|
|
||||||
|
from client import response_handler
|
||||||
|
self.filename = response_handler.handle(client, msg, self, self.dir)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
class PostCommand(AbstractWithBodyCommand):
|
class PostCommand(AbstractWithBodyCommand):
|
||||||
|
6
client/htmlparser.py
Normal file
6
client/htmlparser.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLParser:
|
||||||
|
def __init__(self, soup: BeautifulSoup):
|
||||||
|
pass
|
@@ -2,52 +2,57 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Dict
|
from urllib.parse import urlsplit, unquote
|
||||||
from urllib.parse import urlparse, unquote
|
|
||||||
|
|
||||||
import cssutils
|
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
from client.command import AbstractCommand, GetCommand
|
||||||
from client.httpclient import HTTPClient, FORMAT
|
from client.httpclient import HTTPClient, FORMAT
|
||||||
from httplib import parser
|
from httplib import parser
|
||||||
from httplib.exceptions import InvalidResponse
|
from httplib.exceptions import InvalidResponse
|
||||||
|
from httplib.message import Message
|
||||||
from httplib.retriever import Retriever
|
from httplib.retriever import Retriever
|
||||||
|
|
||||||
|
|
||||||
|
def handle(client: HTTPClient, msg: Message, command: AbstractCommand, dir=None):
    """Process a parsed response preamble and download its body when appropriate.

    The status code is evaluated first by ``BasicResponseHandler``. Only when
    that step hands back a retriever is the body downloaded, using the
    download handler ``DownloadHandler.create`` selects from the content type.

    Args:
        client: connection the response is being read from.
        msg: parsed status line and headers of the response.
        command: the command that issued the request.
        dir: directory to download into; passed through to the download handler.

    Returns:
        Whatever the download handler's ``handle()`` returns, or the filename
        already recorded on ``command`` (``None`` if there is none) when no
        body is downloaded here.
    """
    retriever = BasicResponseHandler(client, msg, command).handle()

    if retriever is None:
        # Nothing to download at this level. For a redirect the command has
        # already been re-executed by BasicResponseHandler and may have stored
        # the downloaded filename on itself; returning a bare None here would
        # make the caller overwrite that filename.
        return getattr(command, "filename", None)

    # Reuse the existing factory rather than repeating the content-type check.
    return DownloadHandler.create(retriever, client, msg, command, dir).handle()
|
||||||
|
|
||||||
|
|
||||||
class ResponseHandler(ABC):
|
class ResponseHandler(ABC):
|
||||||
client: HTTPClient
|
client: HTTPClient
|
||||||
headers: Dict[str, str]
|
|
||||||
status_code: int
|
|
||||||
url: str
|
|
||||||
retriever: Retriever
|
retriever: Retriever
|
||||||
|
msg: Message
|
||||||
|
cmd: AbstractCommand
|
||||||
|
|
||||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str):
|
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd):
|
||||||
self.client = client
|
self.client = client
|
||||||
self.headers = headers
|
|
||||||
self.url = url
|
|
||||||
self.retriever = retriever
|
self.retriever = retriever
|
||||||
pass
|
self.msg = msg
|
||||||
|
self.cmd = cmd
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def handle(self):
|
def handle(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def create(client: HTTPClient, headers, status_code, url):
|
|
||||||
retriever = Retriever.create(client, headers)
|
|
||||||
|
|
||||||
content_type = headers.get("content-type")
|
|
||||||
if content_type and "text/html" in content_type:
|
|
||||||
return HTMLDownloadHandler(retriever, client, headers, url)
|
|
||||||
return RawDownloadHandler(retriever, client, headers, url)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_uri(uri: str):
|
def parse_uri(uri: str):
|
||||||
parsed = urlparse(uri)
|
parsed = urlsplit(uri)
|
||||||
|
|
||||||
# If there is no netloc, the url is invalid, so prepend `//` and try again
|
# If there is no netloc, the url is invalid, so prepend `//` and try again
|
||||||
if parsed.netloc == "":
|
if parsed.netloc == "":
|
||||||
parsed = urlparse("//" + uri)
|
parsed = urlsplit("//" + uri)
|
||||||
|
|
||||||
host = parsed.netloc
|
host = parsed.netloc
|
||||||
path = parsed.path
|
path = parsed.path
|
||||||
@@ -56,11 +61,79 @@ class ResponseHandler(ABC):
|
|||||||
return host, path
|
return host, path
|
||||||
|
|
||||||
|
|
||||||
class DownloadHandler(ResponseHandler, ABC):
|
class BasicResponseHandler(ResponseHandler):
|
||||||
path: str
|
""" Response handler which throws away the body and only shows the headers.
|
||||||
|
In case of a redirect, it will process it and pass it to the appropriate response handler.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
|
def __init__(self, client: HTTPClient, msg: Message, cmd: AbstractCommand):
|
||||||
super().__init__(retriever, client, headers, url)
|
retriever = Retriever.create(client, msg.headers)
|
||||||
|
super().__init__(retriever, client, msg, cmd)
|
||||||
|
|
||||||
|
def handle(self):
|
||||||
|
return self._handle_status()
|
||||||
|
|
||||||
|
def _skip_body(self):
|
||||||
|
logging.debug("Skipping body: [")
|
||||||
|
for line in self.retriever.retrieve():
|
||||||
|
try:
|
||||||
|
logging.debug("%s", line.decode(FORMAT))
|
||||||
|
except Exception:
|
||||||
|
logging.debug("%r", line)
|
||||||
|
|
||||||
|
logging.debug("] done.")
|
||||||
|
|
||||||
|
def _handle_status(self):
    """Decide how to proceed based on the response status code.

    Returns:
        ``self.retriever`` for a 2xx response, the result of
        ``_do_handle_redirect()`` for a 3xx response, and ``None`` otherwise.
        For 101 and for every status >= 400 the status line and headers are
        printed before returning.
    """
    status = self.msg.status
    logging.info("%d %s", status, self.msg.msg)

    if 200 <= status < 300:
        return self.retriever

    if 300 <= status < 400:
        # Redirect
        return self._do_handle_redirect()

    # 101: switching protocols is not supported.
    # >= 400: client *and* server errors. Dump the headers so the failure is
    # visible; 5xx previously fell through without any output.
    if status == 101 or status >= 400:
        print(f"{self.msg.version} {status} {self.msg.msg}")
        print(self.msg.headers)

    return None
|
||||||
|
|
||||||
|
def _do_handle_redirect(self):
|
||||||
|
self._skip_body()
|
||||||
|
|
||||||
|
location = self.msg.headers.get("location")
|
||||||
|
if not location:
|
||||||
|
raise InvalidResponse("No location in redirect")
|
||||||
|
|
||||||
|
parsed_location = urlsplit(location)
|
||||||
|
if not parsed_location.hostname:
|
||||||
|
raise InvalidResponse("Invalid location")
|
||||||
|
|
||||||
|
if not parsed_location.scheme == "http":
|
||||||
|
raise InvalidResponse("Only http is supported")
|
||||||
|
|
||||||
|
self.cmd.uri = location
|
||||||
|
self.cmd.host, self.cmd.port, self.cmd.path = parser.parse_uri(location)
|
||||||
|
|
||||||
|
if self.msg.status == 301:
|
||||||
|
logging.info("Status 301. Closing socket [%s]", self.cmd.host)
|
||||||
|
self.client.close()
|
||||||
|
|
||||||
|
self.cmd.execute()
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class DownloadHandler(ResponseHandler, ABC):
|
||||||
|
|
||||||
|
def __init__(self, retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
|
||||||
|
super().__init__(retriever, client, msg, cmd)
|
||||||
|
|
||||||
if not dir:
|
if not dir:
|
||||||
dir = self._create_directory()
|
dir = self._create_directory()
|
||||||
@@ -68,11 +141,11 @@ class DownloadHandler(ResponseHandler, ABC):
|
|||||||
self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
|
self.path = self._get_duplicate_name(os.path.join(dir, self.get_filename()))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create(retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
|
def create(retriever: Retriever, client: HTTPClient, msg, cmd, dir=None):
|
||||||
content_type = headers.get("content-type")
|
content_type = msg.headers.get("content-type")
|
||||||
if content_type and "text/html" in content_type:
|
if content_type and "text/html" in content_type:
|
||||||
return HTMLDownloadHandler(retriever, client, headers, url, dir)
|
return HTMLDownloadHandler(retriever, client, msg, cmd, dir)
|
||||||
return RawDownloadHandler(retriever, client, headers, url, dir)
|
return RawDownloadHandler(retriever, client, msg, cmd, dir)
|
||||||
|
|
||||||
def _create_directory(self):
|
def _create_directory(self):
|
||||||
path = self._get_duplicate_name(os.path.abspath(self.client.host))
|
path = self._get_duplicate_name(os.path.abspath(self.client.host))
|
||||||
@@ -91,54 +164,25 @@ class DownloadHandler(ResponseHandler, ABC):
|
|||||||
def get_filename(self):
|
def get_filename(self):
|
||||||
"""Returns the filename to download the payload to.
|
"""Returns the filename to download the payload to.
|
||||||
"""
|
"""
|
||||||
filename = "index.html"
|
filename = os.path.basename(self.cmd.path)
|
||||||
|
if filename == '':
|
||||||
parsed = urlparse(self.url)
|
return "index.html"
|
||||||
|
|
||||||
# If there is no netloc, the url is invalid, so prepend `//` and try again
|
|
||||||
if parsed.netloc == "":
|
|
||||||
parsed = urlparse("//" + self.url)
|
|
||||||
|
|
||||||
# If the path contains a `/` get only the last part and use it as filename
|
|
||||||
# If the path end with a `/`, it's a directory so ignore it.
|
|
||||||
if len(parsed.path) != 0:
|
|
||||||
index = parsed.path.rfind("/")
|
|
||||||
if index == -1:
|
|
||||||
filename = parsed.path
|
|
||||||
elif parsed.path[-1] != "/":
|
|
||||||
filename = parsed.path[index:]
|
|
||||||
|
|
||||||
while "%" in filename:
|
while "%" in filename:
|
||||||
filename = unquote(filename)
|
filename = unquote(filename)
|
||||||
|
|
||||||
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
|
filename = re.sub(r"[^\w.+-]+[.]*", '', filename)
|
||||||
|
|
||||||
result = os.path.basename(filename).strip()
|
result = os.path.basename(filename).strip()
|
||||||
if any(letter.isalnum() for letter in result):
|
if any(letter.isalnum() for letter in result):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
return "index.html"
|
return "index.html"
|
||||||
|
|
||||||
def _handle_sub_request(self, client, url):
|
|
||||||
|
|
||||||
(version, status, _) = parser.get_status_line(client)
|
|
||||||
logging.debug("Parsed status-line: version: %s, status: %s", version, status)
|
|
||||||
headers = parser.get_headers(client)
|
|
||||||
logging.debug("Parsed headers: %r", headers)
|
|
||||||
|
|
||||||
if status != 200:
|
|
||||||
raise InvalidResponse("Status not expected 200: " + str(status))
|
|
||||||
|
|
||||||
retriever = Retriever.create(client, headers)
|
|
||||||
handler = RawDownloadHandler(retriever, client, headers, url, os.path.dirname(self.path))
|
|
||||||
|
|
||||||
return handler.handle()
|
|
||||||
|
|
||||||
|
|
||||||
class RawDownloadHandler(DownloadHandler):
|
class RawDownloadHandler(DownloadHandler):
|
||||||
|
|
||||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
|
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
|
||||||
super().__init__(retriever, client, headers, url, dir)
|
super().__init__(retriever, client, msg, cmd, dir)
|
||||||
|
|
||||||
def handle(self) -> str:
|
def handle(self) -> str:
|
||||||
logging.debug("Retrieving payload")
|
logging.debug("Retrieving payload")
|
||||||
@@ -152,8 +196,8 @@ class RawDownloadHandler(DownloadHandler):
|
|||||||
|
|
||||||
|
|
||||||
class HTMLDownloadHandler(DownloadHandler):
|
class HTMLDownloadHandler(DownloadHandler):
|
||||||
def __init__(self, retriever: Retriever, client: HTTPClient, headers: Dict[str, str], url: str, dir=None):
|
def __init__(self, retriever: Retriever, client: HTTPClient, msg: Message, cmd: AbstractCommand, dir=None):
|
||||||
super().__init__(retriever, client, headers, url, dir)
|
super().__init__(retriever, client, msg, cmd, dir)
|
||||||
|
|
||||||
def handle(self) -> str:
|
def handle(self) -> str:
|
||||||
|
|
||||||
@@ -172,11 +216,11 @@ class HTMLDownloadHandler(DownloadHandler):
|
|||||||
|
|
||||||
def _download_images(self, tmp_filename, target_filename):
|
def _download_images(self, tmp_filename, target_filename):
|
||||||
|
|
||||||
(host, path) = ResponseHandler.parse_uri(self.url)
|
(host, path) = ResponseHandler.parse_uri(self.cmd.uri)
|
||||||
with open(tmp_filename, "rb") as fp:
|
with open(tmp_filename, "rb") as fp:
|
||||||
soup = BeautifulSoup(fp, 'lxml')
|
soup = BeautifulSoup(fp, 'lxml')
|
||||||
|
|
||||||
base_url = self.url
|
base_url = self.cmd.uri
|
||||||
base_element = soup.find("base")
|
base_element = soup.find("base")
|
||||||
|
|
||||||
if base_element:
|
if base_element:
|
||||||
@@ -186,58 +230,24 @@ class HTMLDownloadHandler(DownloadHandler):
|
|||||||
tag: Tag
|
tag: Tag
|
||||||
for tag in soup.find_all("img"):
|
for tag in soup.find_all("img"):
|
||||||
try:
|
try:
|
||||||
if tag.has_attr("src"):
|
if not tag.has_attr("src"):
|
||||||
el_name = "src"
|
|
||||||
elif tag.has_attr("data-src"):
|
|
||||||
el_name = "data-src"
|
|
||||||
else:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if tag[el_name] in processed:
|
if tag["src"] in processed:
|
||||||
new_url = processed.get(tag[el_name])
|
new_url = processed.get(tag["src"])
|
||||||
else:
|
else:
|
||||||
new_url = self.__download_image(tag[el_name], host, base_url)
|
new_url = self.__download_image(tag["src"], host, base_url)
|
||||||
processed[tag[el_name]] = new_url
|
processed[tag["src"]] = new_url
|
||||||
if new_url:
|
if new_url:
|
||||||
tag[el_name] = new_url
|
tag["src"] = new_url
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error("Failed to download image: %s, skipping...", tag[el_name], exc_info=e)
|
logging.error("Failed to download image: %s, skipping...", tag["src"], exc_info=e)
|
||||||
|
|
||||||
for tag in soup.find_all("div"):
|
|
||||||
if not tag.has_attr("style"):
|
|
||||||
continue
|
|
||||||
style = cssutils.parseStyle(tag["style"])
|
|
||||||
|
|
||||||
if "background" in style and "url(" in style["background"]:
|
|
||||||
el_name = "background"
|
|
||||||
elif "background-image" in style and "url(" in style["background-image"]:
|
|
||||||
el_name = "background-image"
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
el = style[el_name]
|
|
||||||
start = el.find("url(") + 4
|
|
||||||
end = el.find(")", start)
|
|
||||||
url = el[start:end].strip()
|
|
||||||
|
|
||||||
try:
|
|
||||||
if url in processed:
|
|
||||||
new_url = url
|
|
||||||
else:
|
|
||||||
new_url = self.__download_image(url, host, base_url)
|
|
||||||
processed[url] = new_url
|
|
||||||
if new_url:
|
|
||||||
el = el[:start] + new_url + el[end:]
|
|
||||||
style[el_name] = el
|
|
||||||
tag["style"] = style.cssText
|
|
||||||
except Exception as e:
|
|
||||||
logging.debug("Internal error", exc_info=e)
|
|
||||||
logging.error("Failed to download image: %s, skipping...", tag["src"])
|
|
||||||
|
|
||||||
with open(target_filename, 'w') as file:
|
with open(target_filename, 'w') as file:
|
||||||
file.write(str(soup))
|
file.write(str(soup))
|
||||||
|
|
||||||
def __download_image(self, img_src, host, base_url):
|
def __download_image(self, img_src, host, base_url):
|
||||||
parsed = urlparse(img_src)
|
parsed = urlsplit(img_src)
|
||||||
|
|
||||||
logging.debug("Downloading image: %s", img_src)
|
logging.debug("Downloading image: %s", img_src)
|
||||||
|
|
||||||
@@ -245,36 +255,18 @@ class HTMLDownloadHandler(DownloadHandler):
|
|||||||
# Not a valid url
|
# Not a valid url
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
if parsed.hostname == host:
|
||||||
|
port = self.cmd.port
|
||||||
|
elif ":" in parsed.netloc:
|
||||||
|
port = parsed.netloc.split(":", 1)[1]
|
||||||
|
else:
|
||||||
|
port = 80
|
||||||
|
|
||||||
if len(parsed.netloc) == 0 and parsed.path != "/":
|
if len(parsed.netloc) == 0 and parsed.path != "/":
|
||||||
# relative url, append base_url
|
# relative url, append base_url
|
||||||
img_src = os.path.join(os.path.dirname(base_url), parsed.path)
|
img_src = os.path.join(os.path.dirname(base_url), parsed.path)
|
||||||
|
|
||||||
parsed = urlparse(img_src)
|
command = GetCommand(img_src, port, os.path.dirname(self.path))
|
||||||
|
command.execute(True)
|
||||||
|
|
||||||
# Check if the image is located on the same server
|
return command.filename
|
||||||
if len(parsed.netloc) == 0 or parsed.netloc == host:
|
|
||||||
same_host = True
|
|
||||||
img_host = host
|
|
||||||
img_path = parsed.path
|
|
||||||
else:
|
|
||||||
same_host = False
|
|
||||||
(img_host, img_path) = ResponseHandler.parse_uri(img_src)
|
|
||||||
|
|
||||||
message = f"GET {img_path} HTTP/1.1\r\n"
|
|
||||||
message += "Accept: */*\r\nAccept-Encoding: identity\r\n"
|
|
||||||
message += f"Host: {img_host}\r\n\r\n"
|
|
||||||
message = message.encode(FORMAT)
|
|
||||||
|
|
||||||
if same_host:
|
|
||||||
client = self.client
|
|
||||||
client.reset_request()
|
|
||||||
else:
|
|
||||||
client = HTTPClient(img_src)
|
|
||||||
client.conn.connect((img_host, 80))
|
|
||||||
client.conn.sendall(message)
|
|
||||||
filename = self._handle_sub_request(client, img_host + img_path)
|
|
||||||
|
|
||||||
if not same_host:
|
|
||||||
client.close()
|
|
||||||
|
|
||||||
return filename
|
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
import socket
|
import socket
|
||||||
from io import BufferedReader
|
from io import BufferedReader
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
BUFSIZE = 4096
|
BUFSIZE = 4096
|
||||||
TIMEOUT = 3
|
TIMEOUT = 3
|
||||||
@@ -11,7 +12,7 @@ MAXLINE = 4096
|
|||||||
class HTTPSocket:
|
class HTTPSocket:
|
||||||
host: str
|
host: str
|
||||||
conn: socket.socket
|
conn: socket.socket
|
||||||
file: BufferedReader
|
file: Tuple[BufferedReader, None]
|
||||||
|
|
||||||
def __init__(self, conn: socket.socket, host: str):
|
def __init__(self, conn: socket.socket, host: str):
|
||||||
|
|
||||||
@@ -24,8 +25,12 @@ class HTTPSocket:
|
|||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.file.close()
|
self.file.close()
|
||||||
|
# self.conn.shutdown(socket.SHUT_RDWR)
|
||||||
self.conn.close()
|
self.conn.close()
|
||||||
|
|
||||||
|
def is_closed(self):
    """Return True when the socket's read file is gone or has been closed."""
    # close() closes the file object but does not reset it to None, so the
    # None check alone would never report a socket that was closed that way.
    return self.file is None or self.file.closed
|
||||||
|
|
||||||
def reset_request(self):
|
def reset_request(self):
|
||||||
self.file.close()
|
self.file.close()
|
||||||
self.file = self.conn.makefile("rb")
|
self.file = self.conn.makefile("rb")
|
||||||
|
16
httplib/message.py
Normal file
16
httplib/message.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class Message:
    """Value object holding the parts of an HTTP message.

    Attributes are plain data; the class performs no validation.
    """

    version: str  # HTTP version as produced by the status-line parser
    status: int  # numeric status code
    msg: str  # reason phrase that followed the status code
    headers: Dict[str, str]  # header name -> value
    body: Optional[bytes]  # payload, or None when it has not been read

    def __init__(self, version: str, status: int, msg: str, headers: Dict[str, str],
                 body: Optional[bytes] = None):
        self.version = version
        self.status = status
        self.msg = msg
        self.headers = headers
        self.body = body

    def __repr__(self):
        return (
            f"{type(self).__name__}(version={self.version!r}, status={self.status!r}, "
            f"msg={self.msg!r}, headers={self.headers!r}, body={self.body!r})"
        )
|
@@ -1,6 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse, urlsplit
|
||||||
|
|
||||||
from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest
|
from httplib.exceptions import InvalidStatusLine, InvalidResponse, BadRequest
|
||||||
from httplib.httpsocket import HTTPSocket
|
from httplib.httpsocket import HTTPSocket
|
||||||
@@ -42,6 +42,26 @@ def get_status_line(client: HTTPSocket):
|
|||||||
return version, status, reason
|
return version, status, reason
|
||||||
|
|
||||||
|
|
||||||
|
def parse_status_line(line: str):
    """Parse an already-read HTTP status line.

    Args:
        line: the raw status line, e.g. ``"HTTP/1.1 200 OK\\r\\n"``.

    Returns:
        A ``(version, status, reason)`` tuple with ``status`` as an ``int``.

    Raises:
        InvalidStatusLine: if the line does not consist of a valid HTTP
            version, a three-digit status code (>= 100) and a reason phrase.
    """
    parts = list(filter(None, line.strip().split(" ", 2)))
    if len(parts) < 3:
        raise InvalidStatusLine(line)  # TODO fix exception

    http_version, status, reason = parts

    if not _is_valid_http_version(http_version):
        raise InvalidStatusLine(line)
    # NOTE(review): this keeps the first four characters of the version token
    # (the "HTTP" prefix for "HTTP/1.1"); confirm this is what callers expect.
    version = http_version[:4]

    # fullmatch, not match: match only anchors at the start, so "200abc" would
    # be accepted here and then escape as a ValueError from int().
    if not re.fullmatch(r"\d{3}", status):
        raise InvalidStatusLine(line)
    status = int(status)
    if status < 100:
        raise InvalidStatusLine(line)

    return version, status, reason
|
||||||
|
|
||||||
|
|
||||||
def parse_request_line(client: HTTPSocket):
|
def parse_request_line(client: HTTPSocket):
|
||||||
line, (method, target, version) = _get_start_line(client)
|
line, (method, target, version) = _get_start_line(client)
|
||||||
|
|
||||||
@@ -157,6 +177,38 @@ def get_headers(client: HTTPSocket):
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def parse_headers(lines):
    """Parse header lines into a dictionary.

    Args:
        lines: iterable of raw header lines (line endings included). Reading
            stops at the first blank line or when the iterable is exhausted.

    Returns:
        A dict mapping lower-cased header names to lower-cased values. Lines
        without a name or without a value are ignored.
    """
    headers = []

    for line in lines:
        if line in ("\r\n", "\n", " "):
            break

        if line[:1].isspace():
            if not headers:
                # The first header after the status-line may not start with
                # whitespace: there is nothing to continue, so drop the line.
                continue
            # Continuation line: strip the line ending of the previous header
            # so both pieces end up on one logical line after the join below.
            headers[-1] = headers[-1].rstrip("\r\n")

        headers.append(line.lstrip())

    result = {}
    for line in "".join(headers).splitlines():
        pos = line.find(":")

        # Skip lines with no colon, an empty name, or nothing after the colon.
        if pos <= 0 or pos >= len(line) - 1:
            continue

        header, value = (part.strip() for part in line.split(":", 1))
        # NOTE(review): the name is passed with its original casing and the
        # stored value is lower-cased (this also affects case-sensitive values
        # such as URLs) -- confirm both are intended.
        check_next_header(result, header, value)
        result[header.lower()] = value.lower()

    return result
|
||||||
|
|
||||||
def check_next_header(headers, next_header: str, next_value: str):
|
def check_next_header(headers, next_header: str, next_value: str):
|
||||||
if next_header == "content-length":
|
if next_header == "content-length":
|
||||||
@@ -166,3 +218,25 @@ def check_next_header(headers, next_header: str, next_value: str):
|
|||||||
if not next_value.isnumeric() or int(next_value) <= 0:
|
if not next_value.isnumeric() or int(next_value) <= 0:
|
||||||
logging.error("Invalid content-length value: %r", next_value)
|
logging.error("Invalid content-length value: %r", next_value)
|
||||||
raise InvalidResponse()
|
raise InvalidResponse()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_uri(uri: str):
    """Split *uri* into the host, port and request path.

    Absolute URIs (``http://host:8080/a?b=c``) as well as scheme-less ones
    (``host:8080/a``, ``host``) are accepted.

    Args:
        uri: the URI to split.

    Returns:
        A ``(host, port, path)`` tuple. ``host`` never contains the port,
        ``port`` is always an ``int`` (the explicit port if present, otherwise
        443 for ``https`` and 80 for everything else) and ``path`` always
        starts with ``/`` and includes the query string, if any.

    Raises:
        ValueError: if no host can be found or the port is not a valid number.
    """
    parsed = urlsplit(uri)

    # Without a leading "//" urlsplit does not recognise a network location
    # ("host/path" becomes a bare path), so retry with the prefix added.
    if not parsed.hostname:
        parsed = urlsplit("//" + uri)

    host = parsed.hostname
    if not host:
        raise ValueError(f"No host in URI: {uri!r}")

    # An empty path would produce an invalid request line.
    path = parsed.path or "/"
    if parsed.query != '':
        path = f"{path}?{parsed.query}"

    port = parsed.port
    if port is None:
        port = 443 if parsed.scheme == "https" else 80

    return host, port, path
|
||||||
|
@@ -42,6 +42,28 @@ class Retriever(ABC):
|
|||||||
return ContentLengthRetriever(client, int(content_length))
|
return ContentLengthRetriever(client, int(content_length))
|
||||||
|
|
||||||
|
|
||||||
|
class PreambleRetriever(Retriever):
    """Retriever for the part of a message that precedes the body.

    Every line read is also recorded in ``buffer`` so the raw preamble can be
    inspected afterwards.
    """

    client: HTTPSocket
    buffer: list  # lines read so far, including the terminating blank line

    def __init__(self, client: HTTPSocket):
        super().__init__(client)
        self.client = client
        self.buffer = []

    def retrieve(self):
        """Yield lines from the client until the blank separator line.

        The blank line itself is stored in ``buffer`` but not yielded.
        """
        while True:
            line = self.client.read_line()

            # NOTE(review): assumes read_line() yields an empty string once the
            # stream is exhausted; without this guard such a stream would be
            # polled forever. Harmless if read_line() raises instead.
            if not line:
                return

            self.buffer.append(line)

            if line in ("\r\n", "\n", " "):
                return

            yield line
|
||||||
|
|
||||||
|
|
||||||
class ContentLengthRetriever(Retriever):
|
class ContentLengthRetriever(Retriever):
|
||||||
length: int
|
length: int
|
||||||
|
|
||||||
@@ -63,21 +85,16 @@ class ContentLengthRetriever(Retriever):
|
|||||||
buffer = self.client.read(remaining)
|
buffer = self.client.read(remaining)
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
logging.error("Timed out before receiving complete payload")
|
logging.error("Timed out before receiving complete payload")
|
||||||
self.client.close()
|
|
||||||
raise IncompleteResponse("Timed out before receiving complete payload")
|
raise IncompleteResponse("Timed out before receiving complete payload")
|
||||||
except ConnectionError:
|
except ConnectionError:
|
||||||
logging.error("Timed out before receiving complete payload")
|
logging.error("Timed out before receiving complete payload")
|
||||||
self.client.close()
|
|
||||||
raise IncompleteResponse("Connection closed before receiving complete payload")
|
raise IncompleteResponse("Connection closed before receiving complete payload")
|
||||||
|
|
||||||
logging.debug("Received payload length: %s", len(buffer))
|
|
||||||
|
|
||||||
if len(buffer) == 0:
|
if len(buffer) == 0:
|
||||||
logging.warning("Received payload length %s less than expected %s", cur_payload_size, self.length)
|
logging.warning("Received payload length %s less than expected %s", cur_payload_size, self.length)
|
||||||
break
|
break
|
||||||
|
|
||||||
cur_payload_size += len(buffer)
|
cur_payload_size += len(buffer)
|
||||||
logging.debug("Processed payload: %r", cur_payload_size)
|
|
||||||
yield buffer
|
yield buffer
|
||||||
|
|
||||||
return b""
|
return b""
|
||||||
@@ -108,7 +125,6 @@ class ChunkedRetriever(Retriever):
|
|||||||
yield buffer
|
yield buffer
|
||||||
|
|
||||||
self.client.read_line() # remove CRLF
|
self.client.read_line() # remove CRLF
|
||||||
return b""
|
|
||||||
|
|
||||||
def __get_chunk_size(self):
|
def __get_chunk_size(self):
|
||||||
line = self.client.read_line()
|
line = self.client.read_line()
|
||||||
|
@@ -1,5 +1,4 @@
|
|||||||
import logging
|
import logging
|
||||||
import multiprocessing
|
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
import threading
|
import threading
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
@@ -69,7 +68,7 @@ class Worker:
|
|||||||
|
|
||||||
handler = RequestHandler(conn, self.host)
|
handler = RequestHandler(conn, self.host)
|
||||||
handler.listen()
|
handler.listen()
|
||||||
except Exception as e:
|
except Exception:
|
||||||
logging.debug("Internal error")
|
logging.debug("Internal error")
|
||||||
|
|
||||||
conn.shutdown(socket.SHUT_RDWR)
|
conn.shutdown(socket.SHUT_RDWR)
|
||||||
|
Reference in New Issue
Block a user