# Source code for bookbuilderpy.html

"""Post-process HTML files."""
import base64
import os
import string
from os.path import exists
from re import MULTILINE
from re import compile as _compile
from typing import Final, Pattern

import bs4  # type: ignore
import minify_html  # type: ignore
import regex as reg  # type: ignore
from selenium import webdriver  # type: ignore
from selenium.webdriver.firefox.service import Service

from bookbuilderpy.logger import logger
from bookbuilderpy.path import UTF8, Path, move_pure
from bookbuilderpy.strings import enforce_non_empty_str, regex_sub
from bookbuilderpy.temp import TempDir
from bookbuilderpy.versions import TOOL_FIREFOX, TOOL_FIREFOX_DRIVER, has_tool

#: The compiled patterns that recognise ``<script>`` tags whose ``src`` is a
#: base64 data URI. One pattern is built per combination of MIME sub-type
#: (``octet-stream``, ``javascript``) and tag ending (self-closing or with an
#: explicit ``</script>``). Group 2 of each pattern is the base64 payload.
__REGEXES_URI_JAVASCRIPT: Final[tuple[reg.Regex, ...]] = tuple(
    reg.compile(  # nosemgrep
        "".join((
            '<script src="data:application/', mime,  # nosemgrep
            r'(;\s*charset=utf-8)?;base64,',  # nosemgrep
            r'((?:[A-Za-z0-9+\/]{4})*',  # nosemgrep
            r'(?:[A-Za-z0-9+\/]{4}|[A-Za-z0-9+\/]{3}=',  # nosemgrep
            r'|[A-Za-z0-9+\/]{2}={2}))',  # nosemgrep
            r'"(\s+type="text/javascript")?',  # nosemgrep
            closing,  # nosemgrep
        )),
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
    for mime in ("octet-stream", "javascript")
    for closing in (r"\s*/>", r">\s*</script>")
)

#: The compiled patterns that recognise stylesheet ``<link>`` tags whose
#: ``href`` is a base64 ``application/octet-stream`` data URI, once for the
#: self-closing form and once for the form with an explicit ``</link>``.
#: Group 2 of each pattern is the base64 payload.
__REGEXES_URI_CSS: Final[tuple[reg.Regex, ...]] = tuple(
    reg.compile(  # nosemgrep
        "".join((
            '<link rel="stylesheet" ',  # nosemgrep
            'href="data:application/', mime,  # nosemgrep
            r'(;\s*charset=utf-8)?;base64,',  # nosemgrep
            r'((?:[A-Za-z0-9+\/]{4})*',  # nosemgrep
            r'(?:[A-Za-z0-9+\/]{4}|[A-Za-z0-9+\/]{3}=|',  # nosemgrep
            r'[A-Za-z0-9+\/]{2}={2}))"',  # nosemgrep
            r'(\s+type="text/css")?',  # nosemgrep
            closing,  # nosemgrep
        )),
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
    for mime in ("octet-stream",)
    for closing in (r"\s*/>", r">\s*</link>")
)


def __base64_unpacker(args, start: str, end: str) -> str:
    """
    Convert the base64 encoded text to normal text.

    This function is meant to be used as replacement callback for a regular
    expression substitution. The second capture group of the match must hold
    the base64 payload. The payload is decoded and wrapped into the given
    start and end tag. The inlined form is only used if it is not longer than
    the text that was matched; otherwise the matched text is kept as is.

    :param args: the regular expression match object
    :param start: the start tag
    :param end: the end tag
    :return: the replacement text for the match
    """
    decoded = base64.b64decode(str(args.group(2)).strip()).decode(UTF8)
    res = f"{start}{decoded.strip()}{end}"
    if len(res) <= (args.end() - args.start()):
        return res
    # Keep the original tag. `str(args)` would be the repr of the match
    # object, not the matched text, and would corrupt the document.
    return args.group(0)


def __base64_unpacker_js(args) -> str:
    """
    Turn a matched base64 javascript data URI into an inline script tag.

    Note from the original author: This does not seem to work?

    :param args: the regular expression match object
    :return: the replacement text for the match
    """
    opening: str = '<script type="text/javascript">'
    closing: str = "</script>"
    return __base64_unpacker(args, start=opening, end=closing)


def __base64_unpacker_css(args) -> str:
    """
    Turn a matched base64 stylesheet data URI into an inline style tag.

    Note from the original author: This does not seem to work?

    :param args: the regular expression match object
    :return: the replacement text for the match
    """
    opening: str = '<style type="text/css">'
    closing: str = "</style>"
    return __base64_unpacker(args, start=opening, end=closing)


def __unpack_data_uris(text: str) -> str:
    """
    Unpack all base64 javascript and css data URIs in the text.

    Every pattern is applied repeatedly until the text does not change
    anymore.

    :param text: the original html text
    :return: the text with all scripts and style sheets expanded
    """
    for regexes, unpacker in (
            (__REGEXES_URI_JAVASCRIPT, __base64_unpacker_js),
            (__REGEXES_URI_CSS, __base64_unpacker_css)):
        for regex in regexes:
            while True:
                tn = reg.sub(regex, unpacker, text)
                # Compare by value: whether `sub` hands back the very same
                # string object when nothing matched is an implementation
                # detail, so an identity test is not a reliable stop signal.
                if tn == text:
                    break
                text = tn
    return text


# noinspection PyBroadException
def html_postprocess(in_file: str,
                     out_file: str,
                     flatten_data_uris: bool = True,
                     fully_evaluate_html: bool = False,
                     purge_scripts: bool = False,
                     minify: bool = True,
                     purge_mathjax: bool = True,
                     canonicalize_ids: bool = True,
                     overwrite: bool = False) -> Path:
    """
    Post-process a html file.

    The steps are applied in this order: flattening of data URIs, full
    evaluation in a headless Firefox, and crushing/minification. The result
    is written to `out_file`; if no step changed anything, the input file is
    copied instead.

    :param in_file: the input file
    :param out_file: the output file
    :param flatten_data_uris: should we flatten data URIs?
    :param fully_evaluate_html: should we use selenium to fully evaluate
        all html and javascript? This step is skipped (with a log message)
        if Firefox or its driver are not available.
    :param purge_scripts: should we purge all javascripts from the file?
    :param minify: should we minify the HTML output?
    :param purge_mathjax: purge all mathjax stuff? This only takes effect if
        at least one of `minify`, `canonicalize_ids`, or `purge_scripts` is
        set, because only then the crushing step is executed.
    :param canonicalize_ids: should we canonicalize the IDs?
    :param overwrite: should the output file be overwritten if it exists?
    :return: the output file
    :raises ValueError: if the output file exists and `overwrite` is not
        set, if input and output file are the same, or if the browser
        returns empty html
    """
    source = Path.file(in_file)
    output = Path.path(out_file)
    logger(f"post-processing HTML file from '{source}' to '{output}'.")
    if (not overwrite) and exists(output):
        raise ValueError(f"Output file '{output}' already exists.")
    if source == output:
        raise ValueError(f"Input and output file is the same: '{source}'.")

    # `current_file` is the file on disk holding the latest state, while
    # `needs_file_out` tells whether `text` is newer than that file.
    current_file: Path = source
    needs_file_out: bool = False
    text: str = enforce_non_empty_str(source.read_all_str().strip())

    with TempDir.create() as temp:
        if flatten_data_uris:  # flatten data uris
            text_n = enforce_non_empty_str(__unpack_data_uris(text))
            if text_n != text:
                text = text_n
                needs_file_out = True
                logger("flattening the data uris changed the HTML content.")
            else:
                logger("flattening the data uris did not change the "
                       "HTML content.")
            del text_n

        if fully_evaluate_html:  # flatten scripts and html
            if has_tool(TOOL_FIREFOX_DRIVER) and has_tool(TOOL_FIREFOX):
                # The browser loads from disk, so the current text must be
                # stored in a file first. Do this before the browser is
                # started so that an I/O error cannot leave it running.
                if needs_file_out:
                    current_file = temp.resolve_inside("1.html")
                    current_file.write_all(text)
                    needs_file_out = False
                current_file.enforce_file()

                options = webdriver.FirefoxOptions()
                options.add_argument("--enable-javascript")
                options.add_argument("-headless")
                service = Service(log_path=os.path.devnull)
                try:
                    browser = webdriver.Firefox(
                        options=options, service=service)
                except Exception:  # pylint: disable=W0703
                    # retry with an explicitly specified firefox binary
                    options.binary_location = TOOL_FIREFOX
                    browser = webdriver.Firefox(
                        options=options, service=service)

                try:
                    logger(f"invoking '{TOOL_FIREFOX_DRIVER}' via selenium on "
                           f"'{current_file}' to evaluate HTML.")
                    browser.get("file:///" + current_file)
                    browser.implicitly_wait(1)
                    html = browser.page_source
                finally:
                    # always shut the browser down, also if loading failed
                    browser.quit()

                html = html.strip()
                if not html:
                    raise ValueError("Browser returned empty html.")
                if not html.startswith("<!"):
                    html = "<!DOCTYPE HTML>" + html
                if html != text:
                    needs_file_out = True
                    text = html
                    logger("html evaluation did change something.")
                else:
                    logger("html evaluation changed nothing.")
                del html
            else:
                logger(f"cannot use HTML evaluation, '{TOOL_FIREFOX}' or '"
                       f"{TOOL_FIREFOX_DRIVER}' not present.")

        if minify or canonicalize_ids or purge_scripts:  # minify output
            ntext = enforce_non_empty_str(__html_crusher(
                text,
                canonicalize_ids=canonicalize_ids,
                purge_mathjax=purge_mathjax,
                minify=minify,
                purge_scripts=purge_scripts))
            if ntext != text:
                needs_file_out = True
                text = ntext
                logger("html minification has changed the content.")
            else:
                logger("html minification had no impact")
            del ntext

        # This must happen inside the `with` block, because `current_file`
        # may be located in the temporary directory.
        if needs_file_out:
            logger(f"writing post-processing result to '{output}'.")
            output.write_all(text)
        elif current_file == source:
            logger(f"copying HTML from '{source}' to '{output}'.")
            Path.copy_file(source, output)
        else:
            logger(f"moving HTML from '{current_file}' to '{output}'.")
            move_pure(current_file, output)

    output.enforce_file()
    return output
def __inner_minify(parsed: "bs4.BeautifulSoup") -> None:
    """
    Execute the inner HTML minification routine.

    This routine can be applied before and after ID normalization. It
    modifies the parsed document in place in two passes: First, self-links
    (an empty `a` as first child of a `span` that points to the id of this
    very `span`) lose their `href`. Second, `span`, `div`, and `g` tags whose
    only content is one tag of the same name are replaced by that child.

    :param bs4.BeautifulSoup parsed: the tags to process
    """
    # try to discover and purge useless references
    for tag in parsed("span"):
        if ("id" not in tag.attrs) or (not tag.contents):
            continue
        tagid = tag.attrs["id"]
        child = tag.contents[0]
        if child.name == "a" and "href" in child.attrs:
            ref = child.attrs["href"]
            if ref.startswith("#") and (ref[1:] == tagid) \
                    and (not (child.contents or child.string)):
                del child.attrs["href"]

    # replace tags with their children if they have no attributes
    # or other contents
    # NOTE(review): `tag.string` also resolves through a sole child tag, so
    # wrappers whose only child holds nothing but text are skipped here.
    # Confirm that this is intended.
    for name in ["span", "div", "g"]:
        for tag in reversed(list(parsed(name))):
            if (len(tag.contents) != 1) or tag.string:
                continue
            child = tag.contents[0]
            if child.name != name:
                continue
            if not tag.attrs:
                tag.replace_with(child)
            elif not child.attrs:
                child.attrs = tag.attrs
                tag.replace_with(child)
            else:
                # One of the two tags carries nothing but an id. Merging is
                # only safe if the other one has no id of its own: otherwise
                # one id would overwrite the other and references to the
                # lost id would dangle.
                tag_only_id = list(tag.attrs) == ["id"]
                child_only_id = list(child.attrs) == ["id"]
                if (tag_only_id and ("id" not in child.attrs)) \
                        or (child_only_id and ("id" not in tag.attrs)):
                    child.attrs.update(tag.attrs)
                    tag.replace_with(child)


#: The useless spans pattern: a `span` without attributes that wraps nothing
#: but plain text (no `<`). The dash and the brackets must be escaped inside
#: the character class; unescaped, the `]` ends the class early and `:-_`
#: becomes a character range that includes `<`.
__USELESS_SPANS: Final[Pattern] = _compile(
    r"<span>([a-zA-Z0-9 \t\n,;.:\-_#'+~*^°!\"§$%&/()\[\]{}=?\\`@|>]*?)</span>",
    MULTILINE)


def __purge_mathjax(parsed: "bs4.BeautifulSoup") -> None:
    """
    Remove the MathJax leftovers from the parsed document, in place.

    :param parsed: the parsed document
    """
    # delete useless mathml content
    for tag in parsed("mjx-assistive-mml"):
        tag.decompose()

    # delete useless components of tags
    for tag in parsed("use"):
        tag.attrs.pop("data-c", None)
    for tag in parsed("g"):
        tag.attrs.pop("data-mml-node", None)
        tag.attrs.pop("data-mjx-texclass", None)
    for tag in parsed("mjx-container"):
        classes = tag.attrs.get("class")
        if classes is not None:
            if isinstance(classes, str):
                classes = classes.split()
            kept = [c for c in classes if c != "CtxtMenu_Attached_0"]
            if len(kept) != len(classes):
                if kept:
                    tag.attrs["class"] = " ".join(kept)
                else:
                    del tag.attrs["class"]
        tag.attrs.pop("ctxtmenu_counter", None)
        tag.attrs.pop("tabindex", None)

    # purge useless context menu styles
    for tag in parsed("style"):
        tagtext = tag.string
        if tagtext is None:  # empty style tag or several child nodes
            continue
        if ".CtxtMenu_" in tagtext:
            tag.decompose()
            continue
        found = False
        while True:
            # cut from the selector to the closing brace of its rule
            idx1 = tagtext.find("mjx-assistive-mml")
            if idx1 < 0:
                break
            idx2 = tagtext.find("{", idx1)
            if idx2 <= idx1:
                break
            idx3 = tagtext.find("}", idx2)
            if idx3 <= idx2:
                break
            tagtext = tagtext[:idx1].strip() + \
                tagtext[(idx3 + 1):].strip()
            found = True
        if found:
            tag.string = tagtext


def __canonicalize_ids(parsed: "bs4.BeautifulSoup") -> None:
    """
    Replace all referenced ids with short ids and purge all others.

    The values of `id` and `name` attributes (except the `name` of `meta`
    tags) are treated as ids. References are `href` and `xlink:href` values
    starting with `#`. The more often an id is referenced, the shorter its
    replacement. The document is modified in place.

    :param parsed: the parsed document
    :raises ValueError: if an id is defined twice or if a reference points
        to an id that does not exist
    """
    id_counts: dict[str, int] = {}

    # find all IDs
    for ref in ["id", "name"]:
        for tag in parsed.find_all(lambda tg, rr=ref: rr in tg.attrs):
            a = tag.attrs[ref]
            if len(a) <= 0:
                del tag.attrs[ref]
                continue
            if (tag.name.lower() == "meta") and (ref == "name"):
                continue
            if a in id_counts:
                raise ValueError(
                    f"id '{a}' in '{ref}' of tag '{tag}' appears twice!")
            id_counts[a] = 0

    # count the references to them
    for ref in ["href", "xlink:href"]:
        for tag in parsed.find_all(lambda tg, rr=ref: rr in tg.attrs):
            a = tag.attrs[ref]
            if a.startswith("#"):
                a = a[1:].strip()
                if not a:  # a plain "#" does not reference any id
                    continue
                if a not in id_counts:
                    raise ValueError("Found reference to undefined id "
                                     f"'{a}' of tag '{tag}'.")
                id_counts[a] += 1

    # purge all unreferenced ids and give the shortest new ids to the
    # most frequently referenced ones
    ranked = sorted(((tid, count) for (tid, count) in id_counts.items()
                     if count > 0), key=lambda x: -x[1])
    ids: dict[str, str] = {tid: __int2str(index)
                           for index, (tid, _) in enumerate(ranked)}
    del id_counts, ranked

    # write back the ids
    for ref in ["id", "name"]:
        for tag in parsed.find_all(lambda tg, rr=ref: rr in tg.attrs):
            if (tag.name.lower() == "meta") and (ref == "name"):
                continue
            tid = tag.attrs[ref]
            if tid in ids:
                tag.attrs[ref] = ids[tid]
            else:
                del tag.attrs[ref]

    # re-link the references
    for ref in ["href", "xlink:href"]:
        for tag in parsed.find_all(lambda tg, rr=ref: rr in tg.attrs):
            a = tag.attrs[ref]
            if a.startswith("#"):
                a = a[1:].strip()
                if not a:  # a plain "#" does not reference any id
                    continue
                if a not in ids:
                    raise ValueError(
                        f"Found reference to deleted id '{a}'.")
                tag.attrs[ref] = f"#{ids[a]}"


def __html_crusher(text: str,
                   canonicalize_ids: bool = True,
                   purge_mathjax: bool = True,
                   minify: bool = True,
                   purge_scripts: bool = False) -> str:
    """
    Crush the html content.

    The re-serialized document and the output of the final minification are
    each only used if they are shorter than the text they would replace.

    :param text: the text coming in
    :param canonicalize_ids: should we canonicalize the IDs?
    :param purge_mathjax: purge all mathjax stuff?
    :param minify: should we minify the HTML output?
    :param purge_scripts: should we purge all javascripts?
    :return: the crushed html text
    :raises ValueError: if `canonicalize_ids` is set and the ids or
        references in the document are inconsistent
    """
    parsed: bs4.BeautifulSoup = bs4.BeautifulSoup(text, "html.parser")

    # remove the useless mathjax content
    if purge_mathjax:
        __purge_mathjax(parsed)

    # purge all scripts
    if purge_scripts:
        for tag in parsed("script"):
            tag.decompose()

    if minify:
        # merge all styles; style tags without a single text node are left
        # untouched, because their `string` is None
        styles = [tag for tag in parsed("style") if tag.string is not None]
        if len(styles) > 1:
            all_styles = "".join(tag.string.strip() for tag in styles)
            for tag in styles[1:]:
                tag.decompose()
            styles[0].string = all_styles

        # remove the generator meta data, as it is not needed
        for tag in parsed("meta"):
            if tag.attrs.get("name") == "generator":
                tag.decompose()

        __inner_minify(parsed)

    # replace all ids with shorter ids
    if canonicalize_ids:
        __canonicalize_ids(parsed)

        # Since we have minified IDs, we may have purged useless IDs.
        # Thus, maybe we can now purge additional tags.
        if minify:
            __inner_minify(parsed)

    # convert the parsed html back to text and check if it is smaller
    ntext = str(parsed)
    if len(ntext) < len(text):
        text = ntext

    # apply the final minification step
    if minify:
        ntext = enforce_non_empty_str(
            minify_html.minify(  # pylint: disable=E1101
                text,
                do_not_minify_doctype=True,
                ensure_spec_compliant_unquoted_attribute_values=True,
                remove_bangs=True,
                remove_processing_instructions=True,
                keep_html_and_head_opening_tags=True,
                keep_spaces_between_attributes=True,
                minify_css=True,
                minify_js=True).strip())
        if len(ntext) < len(text):
            text = ntext

        text = regex_sub(__USELESS_SPANS, "\\1", text)

    return text


#: the internal start digits that can be used for int to string conversion
__DIGITS_START = string.ascii_letters
#: the internal digits that can be used for int to string conversion
__DIGITS = __DIGITS_START + string.digits + "-_"


def __int2str(x: int) -> str:
    """
    Convert an integer to a string.

    The least significant digit comes first and is always a letter (base
    52), all following digits may also be decimal digits, `-`, or `_`
    (base 64).

    :param x: the integer, must not be negative
    :return: the compact string
    :raises ValueError: if `x` is negative
    """
    if x < 0:  # floor division would never reach 0 for negative numbers
        raise ValueError(f"x must not be negative, but is {x}.")
    if x == 0:
        return __DIGITS_START[0]
    digits: list[str] = []
    use_digits = __DIGITS_START
    while x:
        x, index = divmod(x, len(use_digits))
        digits.append(use_digits[index])
        use_digits = __DIGITS
    return "".join(digits)