Coverage for bookbuilderpy/html.py: 40%
268 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-17 23:15 +0000
1"""Post-process HTML files."""
2import base64
3import os
4import string
5from os.path import exists
6from re import MULTILINE
7from re import compile as _compile
8from typing import Final, Pattern
10import bs4 # type: ignore
11import minify_html # type: ignore
12import regex as reg # type: ignore
13from selenium import webdriver # type: ignore
14from selenium.webdriver.firefox.service import Service
16from bookbuilderpy.logger import logger
17from bookbuilderpy.path import UTF8, Path, move_pure
18from bookbuilderpy.strings import enforce_non_empty_str, regex_sub
19from bookbuilderpy.temp import TempDir
20from bookbuilderpy.versions import TOOL_FIREFOX, TOOL_FIREFOX_DRIVER, has_tool
#: the regexes for java script:
#: each pattern matches one ``<script src="data:application/...;base64,...">``
#: tag (either self-closing or with an empty ``</script>`` body); the second
#: regex group captures the base64 payload, which is later expanded into an
#: inline script by :func:`__base64_unpacker_js`
__REGEXES_URI_JAVASCRIPT: Final[tuple[reg.Regex, ...]] = tuple(
    [reg.compile(  # nosemgrep
        f'<script src=\"data:application/{x}'  # nosemgrep
        '(;\\s*charset=utf-8)?;base64,'  # nosemgrep
        # the base64 body: groups of 4 chars, optionally padded with = or ==
        '((?:[A-Za-z0-9+\\/]{4})*(?:[A-Za-z0-9+\\/]'  # nosemgrep
        '{4}|[A-Za-z0-9+\\/]{3}='  # nosemgrep
        '|[A-Za-z0-9+\\/]{2}={2}))'  # nosemgrep
        '\"(\\s+type="text/javascript")?'  # nosemgrep
        f'{y}',  # nosemgrep
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
        for x in ["octet-stream", "javascript"]  # the observed mime types
        for y in ["\\s*/>", ">\\s*</script>"]],  # the two tag-closing forms
)
#: the regexes for css:
#: each pattern matches one ``<link rel="stylesheet"
#: href="data:application/...;base64,...">`` tag; the second regex group
#: captures the base64 payload, which is later expanded into an inline
#: ``<style>`` element by :func:`__base64_unpacker_css`
__REGEXES_URI_CSS: Final[tuple[reg.Regex, ...]] = tuple(
    [reg.compile(  # nosemgrep
        '<link rel=\"stylesheet\" '  # nosemgrep
        f'href=\"data:application/{x}(;'  # nosemgrep
        # the base64 body: groups of 4 chars, optionally padded with = or ==
        '\\s*charset=utf-8)?;base64,((?:[A-Za-z0-9'  # nosemgrep
        '+\\/]{4})*(?:[A-Za-z'  # nosemgrep
        '0-9+\\/]{4}|[A-Za-z0-9+\\/]{3}=|'  # nosemgrep
        '[A-Za-z0-9+\\/]{2}={2}))\"'  # nosemgrep
        '(\\s+type="text/css")?'  # nosemgrep
        f'{y}',  # nosemgrep
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
        for x in ["octet-stream"]  # the observed mime type
        for y in ["\\s*/>", ">\\s*</link>"]],  # the two tag-closing forms
)
def __base64_unpacker(args, start: str, end: str) -> str:
    """
    Convert a base64-encoded data-URI match to plain inline text.

    The second regex group of the match holds the base64 payload. If the
    decoded payload, wrapped into `start` and `end`, is not longer than
    the original matched text, it is returned as the replacement.
    Otherwise the matched text is kept unchanged.

    :param args: the match object produced by one of the data-URI regexes
    :param start: the opening tag for the decoded text
    :param end: the closing tag for the decoded text
    :return: the replacement text
    """
    decoded = base64.b64decode(str(args.groups()[1]).strip()).decode(UTF8)
    res = f"{start}{decoded.strip()}{end}"
    if len(res) <= (args.end() - args.start()):
        return res
    # Bug fix: keep the matched text itself. The previous code returned
    # `str(args).strip()`, i.e., the repr of the match object, which would
    # have injected text like "<regex.Match object ...>" into the output.
    return args.group()
def __base64_unpacker_js(args) -> str:
    """
    Expand a matched base64 javascript data URI into an inline script tag.

    This does not seem to work?

    :param args: the match object of a javascript data-URI regex
    :return: the replacement text
    """
    opening = '<script type="text/javascript">'
    closing = "</script>"
    return __base64_unpacker(args, opening, closing)
def __base64_unpacker_css(args) -> str:
    """
    Expand a matched base64 css data URI into an inline style tag.

    This does not seem to work?

    :param args: the match object of a css data-URI regex
    :return: the replacement text
    """
    opening = '<style type="text/css">'
    return __base64_unpacker(args, opening, "</style>")
def __unpack_data_uris(text: str) -> str:
    """
    Unpack all javascript and css data URIs into plain inline text.

    Every regex is applied repeatedly until a fixed point is reached,
    because expanding one data URI may expose another one.

    :param text: the original html text
    :return: the text with all scripts and styles expanded
    """
    # the (regex set, replacement function) pairs, applied in order
    for regexes, unpacker in (
            (__REGEXES_URI_JAVASCRIPT, __base64_unpacker_js),
            (__REGEXES_URI_CSS, __base64_unpacker_css)):
        for regex in regexes:
            while True:
                tn = reg.sub(regex, unpacker, text)
                # Bug fix: compare by equality, not identity. `tn is text`
                # relied on `sub` returning the very same object when no
                # replacement happened - an implementation detail. If an
                # equal-but-new string were returned, the loop would never
                # terminate.
                if tn == text:
                    break
                text = tn
    return text
# noinspection PyBroadException
def html_postprocess(in_file: str,
                     out_file: str,
                     flatten_data_uris: bool = True,
                     fully_evaluate_html: bool = False,
                     purge_scripts: bool = False,
                     minify: bool = True,
                     purge_mathjax: bool = True,
                     canonicalize_ids: bool = True,
                     overwrite: bool = False) -> Path:
    """
    Post-process a html file.

    The steps, in order: optionally flatten base64 data URIs into inline
    scripts/styles, optionally render the page in a headless Firefox via
    selenium (evaluating javascript), and optionally crush/minify the
    result. The output is only written if some step changed the content;
    otherwise the input is copied (or the intermediate file moved).

    :param in_file: the input file
    :param out_file: the output file
    :param flatten_data_uris: should we flatten data URIs?
    :param fully_evaluate_html: should we use selenium to fully evaluate
        all html and javascript?
    :param purge_scripts: should we purge all javascripts from the file?
    :param minify: should we minify the HTML output?
    :param purge_mathjax: purge all mathjax stuff?
    :param canonicalize_ids: should we canonicalize the IDs?
    :param overwrite: should the output file be overwritten if it exists?
    :return: the output file
    :raises ValueError: if the output exists and `overwrite` is `False`,
        if input and output are the same file, or if the browser returned
        empty html
    """
    source = Path.file(in_file)
    output = Path.path(out_file)
    logger(f"post-processing HTML file from '{source}' to '{output}'.")
    if (not overwrite) and exists(output):
        raise ValueError(f"Output file '{output}' already exists.")
    if source == output:
        raise ValueError(f"Input and output file is the same: '{source}'.")

    current_file: Path = source
    # True means: the in-memory `text` differs from what is on disk and
    # must eventually be written to the output file.
    needs_file_out: bool = False
    text: str = enforce_non_empty_str(source.read_all_str().strip())

    with TempDir.create() as temp:
        if flatten_data_uris:  # flatten data uris
            text_n = enforce_non_empty_str(__unpack_data_uris(text))
            if text_n != text:
                text = text_n
                needs_file_out = True
                logger("flattening the data uris changed the HTML content.")
            else:
                logger("flattening the data uris did not change the "
                       "HTML content.")
            del text_n

        if fully_evaluate_html:  # flatten scripts and html
            # only possible if both firefox and its webdriver are present
            if has_tool(TOOL_FIREFOX_DRIVER) and has_tool(TOOL_FIREFOX):
                options = webdriver.FirefoxOptions()
                options.add_argument("--enable-javascript")
                options.add_argument("-headless")
                # discard the geckodriver log
                service = Service(log_path=os.path.devnull)

                try:
                    browser = webdriver.Firefox(
                        options=options, service=service)
                except BaseException:
                    # retry with the binary location set explicitly, in
                    # case selenium could not locate firefox on its own
                    options.binary_location = TOOL_FIREFOX
                    browser = webdriver.Firefox(
                        options=options, service=service)

                # the browser needs an actual file on disk to load
                if needs_file_out:
                    current_file = temp.resolve_inside("1.html")
                    current_file.write_all(text)
                    needs_file_out = False
                current_file.enforce_file()
                logger(f"invoking '{TOOL_FIREFOX_DRIVER}' via selenium on "
                       f"'{current_file}' to evaluate HTML.")
                browser.get("file:///" + current_file)
                # give javascript a moment to run before grabbing the DOM
                browser.implicitly_wait(1)
                html = browser.page_source
                browser.quit()
                html = html.strip()
                if not html:
                    raise ValueError("Browser returned empty html.")
                # the browser may strip the doctype; restore a minimal one
                if not html.startswith("<!"):
                    html = "<!DOCTYPE HTML>" + html
                if html != text:
                    needs_file_out = True
                    text = html
                    logger("html evaluation did change something.")
                else:
                    logger("html evaluation changed nothing.")
                del html
            else:
                logger(f"cannot use HTML evaluation, '{TOOL_FIREFOX}' or '"
                       f"{TOOL_FIREFOX_DRIVER}' not present.")

        if minify or canonicalize_ids or purge_scripts:  # minify output
            ntext = enforce_non_empty_str(__html_crusher(
                text, canonicalize_ids=canonicalize_ids,
                purge_mathjax=purge_mathjax,
                minify=minify,
                purge_scripts=purge_scripts))
            if ntext != text:
                needs_file_out = True
                text = ntext
                logger("html minification has changed the content.")
            else:
                logger("html minification had no impact")
            del ntext

        if needs_file_out:
            logger(f"writing post-processing result to '{output}'.")
            output.write_all(text)
        elif current_file == source:
            logger(f"copying HTML from '{source}' to '{output}'.")
            Path.copy_file(source, output)
        else:
            logger(f"moving HTML from '{current_file}' to '{output}'.")
            move_pure(current_file, output)

    output.enforce_file()
    return output
def __inner_minify(parsed: bs4.BeautifulSoup) -> None:
    """
    Execute the inner HTML minification routine.

    This routine can be applied before and after ID normalization.
    It works in-place on the parsed soup and performs two clean-ups:
    it drops self-referencing `href` attributes from empty anchors
    inside `span`s, and it merges nested single-child
    `span`/`div`/`g` tags where that cannot lose information.

    :param parsed: the parsed document, modified in-place
    """
    # try to discover and purge useless references
    for tag in parsed("span"):
        if "id" in tag.attrs:
            tagid = tag.attrs["id"]
            if tag.contents:
                child = tag.contents[0]
                if child.name == "a" and "href" in child.attrs:
                    ref = child.attrs["href"]
                    # an empty <a href="#X"> inside <span id="X"> only
                    # links to its own parent, so the href is useless
                    if ref.startswith("#") and (ref[1:] == tagid) \
                            and (not (child.contents or child.string)):
                        del child.attrs["href"]

    # replace tags with their children if they have no attributes
    # or other contents
    for name in ["span", "div", "g"]:
        # reversed() processes later (deeper) matches first, so inner
        # merges happen before their enclosing tags are considered
        for tag in reversed(list(parsed(name))):
            if tag.contents and (len(tag.contents) == 1) and \
                    (not tag.string):
                child = tag.contents[0]
                if child.name == name:
                    if not tag.attrs:
                        tag.replace_with(child)
                        continue
                    if not child.attrs:
                        child.attrs = tag.attrs
                        tag.replace_with(child)
                        continue
                    # merge when exactly one of the two tags carries
                    # only an "id" attribute (note the XOR).
                    # NOTE(review): if both tags have an "id", update()
                    # overwrites the child's id with the parent's -
                    # confirm no referenced id can be lost here.
                    if (list(tag.attrs.keys()) == ["id"]) \
                            ^ (list(child.attrs.keys()) == ["id"]):
                        child.attrs.update(tag.attrs)
                        tag.replace_with(child)
                        continue
#: Matches an attribute-less <span> whose content consists only of plain
#: characters (letters, digits, whitespace, and common punctuation - but
#: not "<", so no nested tags). Such spans carry no information and are
#: unwrapped in the final minification step.
#: Bug fix: in the previous pattern, the unescaped "]" in "()[]{}" closed
#: the character class early (turning the tail into a stray alternation)
#: and ".:-_" formed an unintended character range; the pattern therefore
#: never matched the spans it was meant to match. "[", "]" and "-" are
#: now escaped so they are literal members of the class.
__USELESS_SPANS: Final[Pattern] = _compile(
    r"<span>([a-zA-Z0-9 \t\n,;.:\-_#'+~*^°!\"§$%&/()\[\]{}=?\\`@|>]*?)"
    r"</span>",
    MULTILINE)
def __html_crusher(text: str,
                   canonicalize_ids: bool = True,
                   purge_mathjax: bool = True,
                   minify: bool = True,
                   purge_scripts: bool = False) -> str:
    """
    Crush the html content.

    The document is parsed with BeautifulSoup, cleaned up in-place
    (mathjax leftovers, scripts, styles, ids), serialized back, and
    finally passed through `minify_html`. Each serialization is only
    kept if it is actually smaller than the text it replaces.

    :param text: the text coming in
    :param canonicalize_ids: should we canonicalize the IDs?
    :param purge_mathjax: purge all mathjax stuff?
    :param minify: should we minify the HTML output?
    :param purge_scripts: should we purge all javascripts?
    :return: the crushed html text
    :raises ValueError: if an id is defined twice or a local reference
        points to a non-existing id
    """
    parsed: bs4.BeautifulSoup = bs4.BeautifulSoup(text, "html.parser")

    # remove the useless mathjax content
    if purge_mathjax:
        # delete useless mathml content
        for tag in parsed("mjx-assistive-mml"):
            tag.decompose()

        # delete useless components of tags
        for tag in parsed("use"):
            if "data-c" in tag.attrs:
                del tag.attrs["data-c"]
        for tag in parsed("g"):
            if "data-mml-node" in tag.attrs:
                del tag.attrs["data-mml-node"]
            if "data-mjx-texclass" in tag.attrs:
                del tag.attrs["data-mjx-texclass"]
        for tag in parsed("mjx-container"):
            if "class" in tag.attrs:
                # strip the "CtxtMenu_Attached_0" marker class.
                # NOTE(review): the `.replace(" ", " ")` below is a no-op
                # as written; it was presumably `.replace("  ", " ")`
                # (collapse double spaces) before whitespace got mangled
                # in extraction - confirm against the original source.
                clz = " ".join(tag.attrs["class"])
                clzn = clz.replace(" CtxtMenu_Attached_0", "") \
                    .replace("CtxtMenu_Attached_0 ", "") \
                    .replace(" ", " ").strip()
                if clzn != clz:
                    tag.attrs["class"] = clzn
            if "ctxtmenu_counter" in tag.attrs:
                del tag.attrs["ctxtmenu_counter"]
            if "tabindex" in tag.attrs:
                del tag.attrs["tabindex"]

        # purge useless context menu styles
        for tag in parsed("style"):
            tagtext = tag.string
            if ".CtxtMenu_" in tagtext:
                tag.decompose()
                continue
            # cut every "mjx-assistive-mml ... { ... }" rule out of the
            # style sheet text
            found = False
            while True:
                idx1 = tagtext.find("mjx-assistive-mml")
                if idx1 < 0:
                    break
                idx2 = tagtext.find("{", idx1)
                if idx2 <= idx1:
                    break
                idx3 = tagtext.find("}", idx2)
                if idx3 <= idx2:
                    break
                tagtext = tagtext[:idx1].strip() + \
                    tagtext[(idx3 + 1):].strip()
                found = True
            if found:
                tag.string = tagtext

    # purge all scripts
    if purge_scripts:
        for tag in parsed("script"):
            tag.decompose()

    if minify:
        # merge all styles into the first style tag
        styles = parsed("style")
        if len(styles) > 1:
            all_styles = "".join(tag.string.strip() for tag in styles)
            for tag in styles[1:]:
                tag.decompose()
            styles[0].string = all_styles

        # remove the generator meta data, as it is not needed
        for tag in parsed("meta"):
            if "name" in tag.attrs and tag.attrs["name"] == "generator":
                tag.decompose()

        __inner_minify(parsed)

    # replace all ids with shorter ids
    if canonicalize_ids:
        # first, we try to minify the element IDs
        id_counts: dict[str, int] = {}
        # find all IDs; ids must be globally unique across both the
        # "id" and "name" attributes, otherwise we raise an error
        for ref in ["id", "name"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if len(a) <= 0:
                    del tag.attrs[ref]  # drop empty ids right away
                    continue
                # <meta name="..."> is not an anchor - leave it alone
                if (tag.name.lower() == "meta") and (ref == "name"):
                    continue
                if a in id_counts:
                    raise ValueError(
                        f"id '{a}' in '{ref}' of tag '{tag}' appears twice!")
                id_counts[a] = 0
        # count the references to them (only local "#..." references)
        for ref in ["href", "xlink:href"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if a.startswith("#"):
                    a = a[1:].strip()
                    if a not in id_counts:
                        raise ValueError("Found reference to undefined id "
                                         f"'{a}' of tag '{tag}'.")
                    id_counts[a] += 1

        # purge all unreferenced ids
        id_list = [(tid, count) for (tid, count) in id_counts.items()
                   if count > 0]
        del id_counts

        # create smaller IDs: the most-referenced ids get the shortest
        # replacement strings
        id_list.sort(key=lambda x: -x[1])
        ids: dict[str, str] = {}
        cnt: int = 0
        for idx in id_list:
            ids[idx[0]] = __int2str(cnt)
            cnt += 1
        del id_list, cnt

        # write back the ids; unreferenced ids are deleted entirely
        for ref in ["id", "name"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                if (tag.name.lower() == "meta") and (ref == "name"):
                    continue
                tid = tag.attrs[ref]
                if tid in ids:
                    tag.attrs[ref] = ids[tid]
                else:
                    del tag.attrs[ref]

        # re-link the references to the new, shorter ids
        for ref in ["href", "xlink:href"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if a.startswith("#"):
                    a = a[1:].strip()
                    if a not in ids:
                        raise ValueError(
                            f"Found reference to deleted id '{a}'.")
                    tag.attrs[ref] = f"#{ids[a]}"

        # Since we have minified IDs, we may have purged useless IDs.
        # Thus, maybe we can now purge additional tags.
        if minify:
            __inner_minify(parsed)

    # convert the parsed html back to text and check if it is smaller.
    # NOTE(review): `__unicode__()` is a Python-2-era bs4 method; confirm
    # it still exists in the installed bs4 version - `str(parsed)` would
    # be the modern equivalent.
    ntext = parsed.__unicode__()
    if len(ntext) < len(text):
        text = ntext

    # apply the final minification step
    if minify:
        ntext = enforce_non_empty_str(
            minify_html.minify(  # pylint: disable=E1101
                text, do_not_minify_doctype=True,
                ensure_spec_compliant_unquoted_attribute_values=True,
                remove_bangs=True,
                remove_processing_instructions=True,
                keep_html_and_head_opening_tags=True,
                keep_spaces_between_attributes=True,
                minify_css=True,
                minify_js=True).strip())
        if len(ntext) < len(text):
            text = ntext
        # finally, unwrap the remaining useless <span> elements
        text = regex_sub(__USELESS_SPANS, "\\1", text)

    return text
#: the digits permitted in the first position of an id string: letters
#: only, so that every generated id starts with a letter
__DIGITS_START = string.ascii_letters
#: the digits permitted in all further positions of an id string
__DIGITS = __DIGITS_START + string.digits + "-_"


def __int2str(x: int) -> str:
    """
    Convert an integer to a compact identifier string.

    The first character always comes from :data:`__DIGITS_START`, all
    later characters from the larger alphabet :data:`__DIGITS`. Digits
    are emitted least-significant first; since the mapping is a
    bijection, that is sufficient for producing unique short ids.

    :param x: the integer
    :return: the compact string
    """
    alphabet = __DIGITS_START
    if x == 0:
        return alphabet[0]
    chars: list[str] = []
    while x:
        x, rem = divmod(x, len(alphabet))
        chars.append(alphabet[rem])
        alphabet = __DIGITS
    return "".join(chars)