"""Post-process HTML files."""
import base64
import os
import string
from os.path import exists
from re import MULTILINE
from re import compile as _compile
from typing import Final, Pattern
import bs4 # type: ignore
import minify_html # type: ignore
import regex as reg # type: ignore
from selenium import webdriver # type: ignore
from selenium.webdriver.firefox.service import Service
from bookbuilderpy.logger import logger
from bookbuilderpy.path import UTF8, Path, move_pure
from bookbuilderpy.strings import enforce_non_empty_str, regex_sub
from bookbuilderpy.temp import TempDir
from bookbuilderpy.versions import TOOL_FIREFOX, TOOL_FIREFOX_DRIVER, has_tool
#: Regular expressions matching ``<script>`` tags whose ``src`` attribute is
#: a base64 data URI (MIME types ``octet-stream`` and ``javascript``, each
#: with both the self-closing ``/>`` and the explicit ``</script>`` closing
#: variant). Group 2 of every regex captures the base64 payload itself.
__REGEXES_URI_JAVASCRIPT: Final[tuple[reg.Regex, ...]] = tuple(
    [reg.compile(  # nosemgrep
        f'<script src=\"data:application/{x}'  # nosemgrep
        '(;\\s*charset=utf-8)?;base64,'  # nosemgrep
        '((?:[A-Za-z0-9+\\/]{4})*(?:[A-Za-z0-9+\\/]'  # nosemgrep
        '{4}|[A-Za-z0-9+\\/]{3}='  # nosemgrep
        '|[A-Za-z0-9+\\/]{2}={2}))'  # nosemgrep
        '\"(\\s+type="text/javascript")?'  # nosemgrep
        f'{y}',  # nosemgrep
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
        for x in ["octet-stream", "javascript"]
        for y in ["\\s*/>", ">\\s*</script>"]],
)
#: Regular expressions matching ``<link rel="stylesheet">`` tags whose
#: ``href`` attribute is a base64 data URI (MIME type ``octet-stream``,
#: with both the self-closing ``/>`` and the ``</link>`` closing variant).
#: Group 2 of every regex captures the base64 payload itself.
__REGEXES_URI_CSS: Final[tuple[reg.Regex, ...]] = tuple(
    [reg.compile(  # nosemgrep
        '<link rel=\"stylesheet\" '  # nosemgrep
        f'href=\"data:application/{x}(;'  # nosemgrep
        '\\s*charset=utf-8)?;base64,((?:[A-Za-z0-9'  # nosemgrep
        '+\\/]{4})*(?:[A-Za-z'  # nosemgrep
        '0-9+\\/]{4}|[A-Za-z0-9+\\/]{3}=|'  # nosemgrep
        '[A-Za-z0-9+\\/]{2}={2}))\"'  # nosemgrep
        '(\\s+type="text/css")?'  # nosemgrep
        f'{y}',  # nosemgrep
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
        for x in ["octet-stream"]
        for y in ["\\s*/>", ">\\s*</link>"]],
)
def __base64_unpacker(args, start: str, end: str) -> str:
    """
    Convert the base64 encoded text of a regex match to normal text.

    The base64 payload is expected in the second regex group of `args`.
    If the decoded text (wrapped into the `start` and `end` tags) is not
    longer than the matched region, the decoded form is returned;
    otherwise the originally matched text is returned unchanged.

    :param args: the regex match whose second group holds the base64 data
    :param start: the start tag to prepend to the decoded text
    :param end: the end tag to append to the decoded text
    :return: the replacement text
    """
    decoded = base64.b64decode(str(args.groups()[1]).strip()).decode(UTF8)
    res = f"{start}{decoded.strip()}{end}"
    if len(res) <= (args.end() - args.start()):
        return res
    # Bug fix: `str(args)` stringified the match OBJECT itself, which would
    # inject its repr (e.g. "<regex.Match object; ...>") into the document.
    # Return the matched text unchanged instead.
    return str(args.group()).strip()
def __base64_unpacker_js(args) -> str:
    """
    Expand a base64 encoded javascript match into a plain script tag.

    This does not seem to work?

    :param args: the regex match arguments
    :return: the expanded text
    """
    opening = '<script type="text/javascript">'
    closing = "</script>"
    return __base64_unpacker(args, opening, closing)
def __base64_unpacker_css(args) -> str:
    """
    Expand a base64 encoded css match into a plain style tag.

    This does not seem to work?

    :param args: the regex match arguments
    :return: the expanded text
    """
    opening = '<style type="text/css">'
    closing = "</style>"
    return __base64_unpacker(args, opening, closing)
def __unpack_data_uris(text: str) -> str:
    """
    Unpack all javascript and css data URIs in the given html text.

    Each regex is applied repeatedly until it no longer changes the
    text, so that all occurrences are expanded.

    :param text: the original html text
    :return: the text with all scripts and styles expanded
    """
    jobs = ((__REGEXES_URI_JAVASCRIPT, __base64_unpacker_js),
            (__REGEXES_URI_CSS, __base64_unpacker_css))
    for patterns, unpacker in jobs:
        for pattern in patterns:
            while True:
                replaced = reg.sub(pattern, unpacker, text)
                # `sub` hands back the identical object if nothing matched
                if replaced is text:
                    break
                text = replaced
    return text
# noinspection PyBroadException
def html_postprocess(in_file: str,
                     out_file: str,
                     flatten_data_uris: bool = True,
                     fully_evaluate_html: bool = False,
                     purge_scripts: bool = False,
                     minify: bool = True,
                     purge_mathjax: bool = True,
                     canonicalize_ids: bool = True,
                     overwrite: bool = False) -> Path:
    """
    Post-process a html file.

    :param in_file: the input file
    :param out_file: the output file
    :param flatten_data_uris: should we flatten data URIs?
    :param fully_evaluate_html: should we use selenium to fully evaluate
        all html and javascript?
    :param purge_scripts: should we purge all javascripts from the file?
    :param minify: should we minify the HTML output?
    :param purge_mathjax: purge all mathjax stuff?
    :param canonicalize_ids: should we canonicalize the IDs?
    :param overwrite: should the output file be overwritten if it exists?
    :return: the output file
    :raises ValueError: if the output exists while `overwrite` is False,
        if input and output are the same file, or if the browser yields
        empty html
    """
    source = Path.file(in_file)
    output = Path.path(out_file)
    logger(f"post-processing HTML file from '{source}' to '{output}'.")
    if (not overwrite) and exists(output):
        raise ValueError(f"Output file '{output}' already exists.")
    if source == output:
        raise ValueError(f"Input and output file is the same: '{source}'.")
    current_file: Path = source
    # True means `text` holds changes not yet written to `current_file`.
    needs_file_out: bool = False
    text: str = enforce_non_empty_str(source.read_all_str().strip())
    with TempDir.create() as temp:
        if flatten_data_uris:  # flatten data uris
            text_n = enforce_non_empty_str(__unpack_data_uris(text))
            if text_n != text:
                text = text_n
                needs_file_out = True
                logger("flattening the data uris changed the HTML content.")
            else:
                logger("flattening the data uris did not change the "
                       "HTML content.")
            del text_n
        if fully_evaluate_html:  # flatten scripts and html
            if has_tool(TOOL_FIREFOX_DRIVER) and has_tool(TOOL_FIREFOX):
                options = webdriver.FirefoxOptions()
                options.add_argument("--enable-javascript")
                options.add_argument("-headless")
                service = Service(log_path=os.path.devnull)
                try:
                    browser = webdriver.Firefox(
                        options=options, service=service)
                except BaseException:
                    # retry with an explicitly set firefox binary location
                    options.binary_location = TOOL_FIREFOX
                    browser = webdriver.Firefox(
                        options=options, service=service)
                if needs_file_out:
                    # pending changes must be on disk before the browser
                    # can load them
                    current_file = temp.resolve_inside("1.html")
                    current_file.write_all(text)
                    needs_file_out = False
                current_file.enforce_file()
                logger(f"invoking '{TOOL_FIREFOX_DRIVER}' via selenium on "
                       f"'{current_file}' to evaluate HTML.")
                try:
                    browser.get("file:///" + current_file)
                    browser.implicitly_wait(1)
                    html = browser.page_source
                finally:
                    # always shut the browser down, even if loading or
                    # reading the page fails, to avoid leaking processes
                    browser.quit()
                html = html.strip()
                if not html:
                    raise ValueError("Browser returned empty html.")
                if not html.startswith("<!"):
                    html = "<!DOCTYPE HTML>" + html
                if html != text:
                    needs_file_out = True
                    text = html
                    logger("html evaluation did change something.")
                else:
                    logger("html evaluation changed nothing.")
                del html
            else:
                logger(f"cannot use HTML evaluation, '{TOOL_FIREFOX}' or '"
                       f"{TOOL_FIREFOX_DRIVER}' not present.")
        if minify or canonicalize_ids or purge_scripts:  # minify output
            ntext = enforce_non_empty_str(__html_crusher(
                text, canonicalize_ids=canonicalize_ids,
                purge_mathjax=purge_mathjax,
                minify=minify,
                purge_scripts=purge_scripts))
            if ntext != text:
                needs_file_out = True
                text = ntext
                logger("html minification has changed the content.")
            else:
                logger("html minification had no impact")
            del ntext
        if needs_file_out:
            logger(f"writing post-processing result to '{output}'.")
            output.write_all(text)
        elif current_file == source:
            logger(f"copying HTML from '{source}' to '{output}'.")
            Path.copy_file(source, output)
        else:
            logger(f"moving HTML from '{current_file}' to '{output}'.")
            move_pure(current_file, output)
    output.enforce_file()
    return output
def __inner_minify(parsed: bs4.BeautifulSoup) -> None:
    """
    Execute the inner HTML minification routine.

    This routine can be applied before and after ID normalization.
    It strips anchors that merely reference their own enclosing span and
    collapses nested same-name tags that carry no extra information.
    The tree is modified in place.

    :param bs4.BeautifulSoup parsed: the tags to process
    """
    # try to discover and purge useless references: an <a href="#x"> that
    # is the first child of a <span id="x"> and has no content of its own
    # points back at its own parent, so its href can be dropped
    for tag in parsed("span"):
        if "id" in tag.attrs:
            tagid = tag.attrs["id"]
            if tag.contents:
                child = tag.contents[0]
                if child.name == "a" and "href" in child.attrs:
                    ref = child.attrs["href"]
                    if ref.startswith("#") and (ref[1:] == tagid) \
                            and (not (child.contents or child.string)):
                        del child.attrs["href"]
    # replace tags with their children if they have no attributes
    # or other contents
    for name in ["span", "div", "g"]:
        # iterate in reversed document order so nested occurrences are
        # collapsed before their enclosing tags are examined
        for tag in reversed(list(parsed(name))):
            if tag.contents and (len(tag.contents) == 1) and \
                    (not tag.string):
                child = tag.contents[0]
                if child.name == name:  # only merge same-name nesting
                    if not tag.attrs:
                        # outer tag carries no information at all
                        tag.replace_with(child)
                        continue
                    if not child.attrs:
                        # child can take over the outer tag's attributes
                        child.attrs = tag.attrs
                        tag.replace_with(child)
                        continue
                    # if exactly one of the two tags has nothing but an
                    # "id" attribute, the attribute sets can be merged
                    # (outer attributes win on a key clash)
                    if (list(tag.attrs.keys()) == ["id"]) \
                            ^ (list(child.attrs.keys()) == ["id"]):
                        child.attrs.update(tag.attrs)
                        tag.replace_with(child)
                        continue
#: Matches an attribute-less <span> whose content consists only of harmless
#: characters, so the span can be replaced by its content. Bug fix: the
#: original class contained an unescaped "]" (in "()[]{}") that terminated
#: the character class early and turned the tail into an alternation, so
#: the pattern never matched an actual useless span. "[", "]" are now
#: escaped and "-" is escaped so it is a literal instead of a range.
__USELESS_SPANS: Final[Pattern] = _compile(
    r"<span>([a-zA-Z0-9 \t\n,;.:\-_#'+~*^°!\"§$%&/()\[\]{}=?\\?`@|>]*?)"
    r"</span>",
    MULTILINE)
def __html_crusher(text: str,
                   canonicalize_ids: bool = True,
                   purge_mathjax: bool = True,
                   minify: bool = True,
                   purge_scripts: bool = False) -> str:
    """
    Crush the html content.

    :param text: the text coming in
    :param canonicalize_ids: should we canonicalize the IDs?
    :param purge_mathjax: purge all mathjax stuff?
    :param minify: should we minify the HTML output?
    :param purge_scripts: should we purge all javascripts?
    :return: the crushed html text
    :raises ValueError: if an id appears twice or a reference points to
        an undefined or deleted id
    """
    parsed: bs4.BeautifulSoup = bs4.BeautifulSoup(text, "html.parser")
    # remove the useless mathjax content
    if purge_mathjax:
        # delete useless mathml content
        for tag in parsed("mjx-assistive-mml"):
            tag.decompose()
        # delete useless components of tags
        for tag in parsed("use"):
            if "data-c" in tag.attrs:
                del tag.attrs["data-c"]
        for tag in parsed("g"):
            if "data-mml-node" in tag.attrs:
                del tag.attrs["data-mml-node"]
            if "data-mjx-texclass" in tag.attrs:
                del tag.attrs["data-mjx-texclass"]
        for tag in parsed("mjx-container"):
            if "class" in tag.attrs:
                # drop the context-menu marker class; bug fix: the
                # original called `.replace(" ", " ")`, a no-op — the
                # evident intent is collapsing the doubled spaces left
                # behind by the class removal
                clz = " ".join(tag.attrs["class"])
                clzn = clz.replace(" CtxtMenu_Attached_0", "") \
                    .replace("CtxtMenu_Attached_0 ", "") \
                    .replace("  ", " ").strip()
                if clzn != clz:
                    tag.attrs["class"] = clzn
            if "ctxtmenu_counter" in tag.attrs:
                del tag.attrs["ctxtmenu_counter"]
            if "tabindex" in tag.attrs:
                del tag.attrs["tabindex"]
        # purge useless context menu styles
        for tag in parsed("style"):
            # NOTE(review): tag.string may be None for a style tag with
            # nested content — assumes plain-text style bodies; confirm
            tagtext = tag.string
            if ".CtxtMenu_" in tagtext:
                tag.decompose()
                continue
            # cut every "mjx-assistive-mml ... { ... }" rule out of the
            # style text
            found = False
            while True:
                idx1 = tagtext.find("mjx-assistive-mml")
                if idx1 < 0:
                    break
                idx2 = tagtext.find("{", idx1)
                if idx2 <= idx1:
                    break
                idx3 = tagtext.find("}", idx2)
                if idx3 <= idx2:
                    break
                tagtext = tagtext[:idx1].strip() + \
                    tagtext[(idx3 + 1):].strip()
                found = True
            if found:
                tag.string = tagtext
    # purge all scripts
    if purge_scripts:
        for tag in parsed("script"):
            tag.decompose()
    if minify:
        # merge all styles into the first style tag
        styles = parsed("style")
        if len(styles) > 1:
            all_styles = "".join(tag.string.strip() for tag in styles)
            for tag in styles[1:]:
                tag.decompose()
            styles[0].string = all_styles
        # remove the generator meta data, as it is not needed
        for tag in parsed("meta"):
            if "name" in tag.attrs and tag.attrs["name"] == "generator":
                tag.decompose()
        __inner_minify(parsed)
    # replace all ids with shorter ids
    if canonicalize_ids:
        # first, we try to minify the element IDs
        id_counts: dict[str, int] = {}
        # find all IDs (empty ids are deleted on the spot)
        for ref in ["id", "name"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if len(a) <= 0:
                    del tag.attrs[ref]
                    continue
                # "name" on <meta> tags is not an anchor name
                if (tag.name.lower() == "meta") and (ref == "name"):
                    continue
                if a in id_counts:
                    raise ValueError(
                        f"id '{a}' in '{ref}' of tag '{tag}' appears twice!")
                id_counts[a] = 0
        # count the references to them
        for ref in ["href", "xlink:href"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if a.startswith("#"):
                    a = a[1:].strip()
                    if a not in id_counts:
                        raise ValueError("Found reference to undefined id "
                                         f"'{a}' of tag '{tag}'.")
                    id_counts[a] += 1
        # purge all unreferenced ids
        id_list = [(tid, count) for (tid, count) in id_counts.items()
                   if count > 0]
        del id_counts
        # create smaller IDs: the most-referenced ids get the shortest
        # replacement strings
        id_list.sort(key=lambda x: -x[1])
        ids: dict[str, str] = {}
        cnt: int = 0
        for idx in id_list:
            ids[idx[0]] = __int2str(cnt)
            cnt += 1
        del id_list, cnt
        # write back the ids; unreferenced ids are deleted
        for ref in ["id", "name"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                if (tag.name.lower() == "meta") and (ref == "name"):
                    continue
                tid = tag.attrs[ref]
                if tid in ids:
                    tag.attrs[ref] = ids[tid]
                else:
                    del tag.attrs[ref]
        # re-link the references to the new, shorter ids
        for ref in ["href", "xlink:href"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if a.startswith("#"):
                    a = a[1:].strip()
                    if a not in ids:
                        raise ValueError(
                            f"Found reference to deleted id '{a}'.")
                    tag.attrs[ref] = f"#{ids[a]}"
        # Since we have minified IDs, we may have purged useless IDs.
        # Thus, maybe we can now purge additional tags.
        if minify:
            __inner_minify(parsed)
    # convert the parsed html back to text and check if it is smaller;
    # keep whichever version is shorter
    ntext = parsed.__unicode__()
    if len(ntext) < len(text):
        text = ntext
    # apply the final minification step
    if minify:
        ntext = enforce_non_empty_str(
            minify_html.minify(  # pylint: disable=E1101
                text, do_not_minify_doctype=True,
                ensure_spec_compliant_unquoted_attribute_values=True,
                remove_bangs=True,
                remove_processing_instructions=True,
                keep_html_and_head_opening_tags=True,
                keep_spaces_between_attributes=True,
                minify_css=True,
                minify_js=True).strip())
        if len(ntext) < len(text):
            text = ntext
    # finally, unwrap spans that carry no information
    text = regex_sub(__USELESS_SPANS, "\\1", text)
    return text
#: the digits allowed in the first position of a generated id string
__DIGITS_START = string.ascii_letters
#: the digits allowed in all later positions of a generated id string
__DIGITS = __DIGITS_START + string.digits + "-_"


def __int2str(x: int) -> str:
    """
    Encode a non-negative integer as a compact identifier string.

    The least-significant digit comes first. The first character is
    always a letter (drawn from :const:`__DIGITS_START`); later
    characters may additionally be digits, ``-`` or ``_``.

    :param x: the integer to encode
    :return: the compact string representation
    """
    if x == 0:
        return __DIGITS_START[0]
    alphabet = __DIGITS_START
    encoded: list[str] = []
    while x:
        x, remainder = divmod(x, len(alphabet))
        encoded.append(alphabet[remainder])
        # after the first digit, the full character set is allowed
        alphabet = __DIGITS
    return "".join(encoded)