Source code for bookbuilderpy.url
"""Loading of data from urls."""
from typing import Final, cast
import certifi
import urllib3 # type: ignore
from bookbuilderpy.path import UTF8
from bookbuilderpy.strings import (
enforce_non_empty_str,
enforce_non_empty_str_without_ws,
)
#: The shared HTTP pool: one module-wide connection pool manager that is
#: used for all downloads of this module. Server certificates are required
#: (``CERT_REQUIRED``) and checked against the CA bundle file whose path is
#: returned by ``certifi.where()``.
__HTTP: Final[urllib3.PoolManager] = urllib3.PoolManager(
    cert_reqs="CERT_REQUIRED", ca_certs=certifi.where())
def __name(request: urllib3.HTTPResponse) -> str:
    """
    Extract the file name from a request.

    The name is taken from the ``filename`` parameter of the
    ``Content-Disposition`` response header if that header is present and
    a value can be extracted from it. Otherwise, the last path segment of
    the URL of the request is used, after any ``#fragment`` and ``?query``
    part has been removed.

    Every candidate name is passed through the ``enforce_*`` helpers
    imported from :mod:`bookbuilderpy.strings`; presumably these raise an
    error if the name is empty or contains white space -- see that module
    for the exact contract.

    :param request: the request
    :return: the file name
    :raises ValueError: if the URL of the request contains no ``/``
    """
    header_key: str = "Content-Disposition"
    if header_key in request.headers:
        disposition: str = request.headers[header_key]
        i: int = disposition.find("filename")
        if i >= 0:
            i = disposition.find("=", i + 1)
            if i > 0:
                # Try a double-quoted value first, then a single-quoted one.
                for quote in ('"', "'"):
                    start = disposition.find(quote, i + 1)
                    if start > 0:
                        start += 1
                        end = disposition.find(quote, start)
                        if end > start:
                            return enforce_non_empty_str_without_ws(
                                disposition[start:end])
                        # An opening quote without a usable closing quote:
                        # give up on the header and fall back to the URL.
                        break
                else:
                    # No quote at all: the value is a bare token. It ends
                    # at the next ';' (start of the next header parameter)
                    # and may be padded with white space.
                    token = disposition[i + 1:].partition(";")[0]
                    return enforce_non_empty_str_without_ws(token.strip())

    # Fall back to the last path component of the URL.
    _url = enforce_non_empty_str(request.geturl())
    url = _url
    last = url.rfind("#")  # cut off the fragment, if any
    if last > 0:
        url = url[:last]
    last = url.rfind("?")  # cut off the query string, if any
    if last > 0:
        url = url[:last]
    first = url.rfind("/")
    if first < 0:
        raise ValueError(f"Invalid URL '{_url}'.")
    return enforce_non_empty_str_without_ws(url[first + 1:])
def load_binary_from_url(url: str) -> tuple[str, bytes]:
    """
    Load all the binary data from one url.

    A ``GET`` request is sent via the shared HTTP pool of this module. The
    response is always closed again, also if the download fails or if no
    file name can be extracted.

    :param url: the url
    :return: a tuple of the file name and the binary data that was loaded
    :raises ValueError: if the status code of the response is not 200
    """
    request: urllib3.HTTPResponse = cast(
        urllib3.HTTPResponse, __HTTP.request("GET", url))
    try:
        if request.status != 200:
            raise ValueError(
                f"Error '{request.status}' when downloading url '{url}'.")
        data = request.data
        name = __name(request)
    finally:
        # Close the response on the error paths as well, not only on success.
        request.close()
    return name, data
def load_text_from_url(url: str) -> tuple[str, str]:
    """
    Load all the text from one url.

    The resource is downloaded as binary data with
    :func:`load_binary_from_url` and then decoded using the ``UTF8``
    encoding constant imported from :mod:`bookbuilderpy.path`.

    :param url: the url
    :return: a tuple of the file name and the text that was loaded
    """
    loaded = load_binary_from_url(url)
    file_name = loaded[0]
    text = loaded[1].decode(UTF8)
    return file_name, text