Coverage for bookbuilderpy/html.py: 40%

268 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-17 23:15 +0000

1"""Post-process HTML files.""" 

2import base64 

3import os 

4import string 

5from os.path import exists 

6from re import MULTILINE 

7from re import compile as _compile 

8from typing import Final, Pattern 

9 

10import bs4 # type: ignore 

11import minify_html # type: ignore 

12import regex as reg # type: ignore 

13from selenium import webdriver # type: ignore 

14from selenium.webdriver.firefox.service import Service 

15 

16from bookbuilderpy.logger import logger 

17from bookbuilderpy.path import UTF8, Path, move_pure 

18from bookbuilderpy.strings import enforce_non_empty_str, regex_sub 

19from bookbuilderpy.temp import TempDir 

20from bookbuilderpy.versions import TOOL_FIREFOX, TOOL_FIREFOX_DRIVER, has_tool 

21 

#: the regexes detecting base64 data-URI ``<script>`` tags; one pattern per
#: combination of MIME type and closing-tag form
__REGEXES_URI_JAVASCRIPT: Final[tuple[reg.Regex, ...]] = tuple(
    reg.compile(  # nosemgrep
        f'<script src=\"data:application/{mime}'  # nosemgrep
        '(;\\s*charset=utf-8)?;base64,'  # nosemgrep
        '((?:[A-Za-z0-9+\\/]{4})*(?:[A-Za-z0-9+\\/]'  # nosemgrep
        '{4}|[A-Za-z0-9+\\/]{3}='  # nosemgrep
        '|[A-Za-z0-9+\\/]{2}={2}))'  # nosemgrep
        '\"(\\s+type="text/javascript")?'  # nosemgrep
        f'{tail}',  # nosemgrep
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
    for mime in ("octet-stream", "javascript")
    for tail in ("\\s*/>", ">\\s*</script>"))

36 

#: the regexes detecting base64 data-URI stylesheet ``<link>`` tags; one
#: pattern per closing-tag form
__REGEXES_URI_CSS: Final[tuple[reg.Regex, ...]] = tuple(
    reg.compile(  # nosemgrep
        '<link rel=\"stylesheet\" '  # nosemgrep
        f'href=\"data:application/{mime}(;'  # nosemgrep
        '\\s*charset=utf-8)?;base64,((?:[A-Za-z0-9'  # nosemgrep
        '+\\/]{4})*(?:[A-Za-z'  # nosemgrep
        '0-9+\\/]{4}|[A-Za-z0-9+\\/]{3}=|'  # nosemgrep
        '[A-Za-z0-9+\\/]{2}={2}))\"'  # nosemgrep
        '(\\s+type="text/css")?'  # nosemgrep
        f'{tail}',  # nosemgrep
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
    for mime in ("octet-stream",)
    for tail in ("\\s*/>", ">\\s*</link>"))

52 

53 

def __base64_unpacker(args, start: str, end: str) -> str:
    """
    Convert a base64-encoded data URI match back to plain inline text.

    The second regex group of the match holds the base64 payload. The
    decoded text is wrapped into the given start/end tags. The expanded
    form is only used if it is no longer than the original match;
    otherwise the original matched text is kept.

    :param args: the regex match object
    :param start: the start tag to place before the decoded text
    :param end: the end tag to place after the decoded text
    :return: the replacement text
    """
    decoded = base64.b64decode(str(args.groups()[1]).strip()).decode(UTF8)
    res = f"{start}{decoded.strip()}{end}"
    if len(res) <= (args.end() - args.start()):
        return res
    # Bug fix: return the matched text itself. The previous code returned
    # str(args).strip(), i.e., the repr of the match object (something
    # like "<regex.Match object; ...>"), which would corrupt the HTML
    # whenever the decoded form was longer than the original match.
    return args.group().strip()

68 

69 

def __base64_unpacker_js(args) -> str:
    """
    Convert a matched base64-encoded javascript data URI to an inline tag.

    This does not seem to work?

    :param args: the regex match object
    :return: the replacement text
    """
    return __base64_unpacker(
        args, '<script type="text/javascript">', "</script>")

81 

82 

def __base64_unpacker_css(args) -> str:
    """
    Convert a matched base64-encoded css data URI to an inline style tag.

    This does not seem to work?

    :param args: the regex match object
    :return: the replacement text
    """
    return __base64_unpacker(args, '<style type="text/css">', "</style>")

93 

94 

def __unpack_data_uris(text: str) -> str:
    """
    Unpack all javascript and css data URIs in the given HTML text.

    Each pattern is applied repeatedly until it no longer changes the
    text, so that nested/successive occurrences are all expanded.

    :param text: the original html text
    :return: the text with all scripts and styles expanded
    """
    for patterns, unpacker in (
            (__REGEXES_URI_JAVASCRIPT, __base64_unpacker_js),
            (__REGEXES_URI_CSS, __base64_unpacker_css)):
        for pattern in patterns:
            while True:
                replaced = reg.sub(pattern, unpacker, text)
                # reg.sub returns the identical object if nothing matched
                if replaced is text:
                    break
                text = replaced
    return text

115 

116 

# noinspection PyBroadException
def html_postprocess(in_file: str,
                     out_file: str,
                     flatten_data_uris: bool = True,
                     fully_evaluate_html: bool = False,
                     purge_scripts: bool = False,
                     minify: bool = True,
                     purge_mathjax: bool = True,
                     canonicalize_ids: bool = True,
                     overwrite: bool = False) -> Path:
    """
    Post-process a html file.

    :param in_file: the input file
    :param out_file: the output file
    :param flatten_data_uris: should we flatten data URIs?
    :param fully_evaluate_html: should we use selenium to fully evaluate
        all html and javascript?
    :param purge_scripts: should we purge all javascripts from the file?
    :param minify: should we minify the HTML output?
    :param purge_mathjax: purge all mathjax stuff?
    :param canonicalize_ids: should we canonicalize the IDs?
    :param overwrite: should the output file be overwritten if it exists?
    :return: the output file
    """
    source = Path.file(in_file)
    output = Path.path(out_file)
    logger(f"post-processing HTML file from '{source}' to '{output}'.")
    if (not overwrite) and exists(output):
        raise ValueError(f"Output file '{output}' already exists.")
    if source == output:
        raise ValueError(f"Input and output file is the same: '{source}'.")

    current_file: Path = source  # the newest on-disk copy of the html
    needs_file_out: bool = False  # does `text` differ from `current_file`?
    text: str = enforce_non_empty_str(source.read_all_str().strip())

    with TempDir.create() as temp:
        # step 1: expand base64 data URIs into plain inline tags
        if flatten_data_uris:
            flattened = enforce_non_empty_str(__unpack_data_uris(text))
            if flattened == text:
                logger("flattening the data uris did not change the "
                       "HTML content.")
            else:
                text = flattened
                needs_file_out = True
                logger("flattening the data uris changed the HTML content.")
            del flattened

        # step 2: optionally run a headless Firefox over the page so that
        # all javascript is executed and the resulting DOM is captured
        if fully_evaluate_html:
            if has_tool(TOOL_FIREFOX_DRIVER) and has_tool(TOOL_FIREFOX):
                options = webdriver.FirefoxOptions()
                options.add_argument("--enable-javascript")
                options.add_argument("-headless")
                service = Service(log_path=os.path.devnull)

                try:
                    browser = webdriver.Firefox(
                        options=options, service=service)
                except BaseException:
                    # retry once with an explicit browser binary location
                    options.binary_location = TOOL_FIREFOX
                    browser = webdriver.Firefox(
                        options=options, service=service)

                if needs_file_out:  # selenium needs the text on disk
                    current_file = temp.resolve_inside("1.html")
                    current_file.write_all(text)
                    needs_file_out = False
                current_file.enforce_file()
                logger(f"invoking '{TOOL_FIREFOX_DRIVER}' via selenium on "
                       f"'{current_file}' to evaluate HTML.")
                browser.get("file:///" + current_file)
                browser.implicitly_wait(1)
                html = browser.page_source
                browser.quit()
                html = html.strip()
                if not html:
                    raise ValueError("Browser returned empty html.")
                if not html.startswith("<!"):
                    html = "<!DOCTYPE HTML>" + html
                if html == text:
                    logger("html evaluation changed nothing.")
                else:
                    text = html
                    needs_file_out = True
                    logger("html evaluation did change something.")
                del html
            else:
                logger(f"cannot use HTML evaluation, '{TOOL_FIREFOX}' or '"
                       f"{TOOL_FIREFOX_DRIVER}' not present.")

        # step 3: crush/minify the html text itself
        if minify or canonicalize_ids or purge_scripts:
            crushed = enforce_non_empty_str(__html_crusher(
                text, canonicalize_ids=canonicalize_ids,
                purge_mathjax=purge_mathjax,
                minify=minify,
                purge_scripts=purge_scripts))
            if crushed == text:
                logger("html minification had no impact")
            else:
                text = crushed
                needs_file_out = True
                logger("html minification has changed the content.")
            del crushed

        # step 4: materialize the final result at the output path
        if needs_file_out:
            logger(f"writing post-processing result to '{output}'.")
            output.write_all(text)
        elif current_file == source:
            logger(f"copying HTML from '{source}' to '{output}'.")
            Path.copy_file(source, output)
        else:
            logger(f"moving HTML from '{current_file}' to '{output}'.")
            move_pure(current_file, output)

    output.enforce_file()
    return output

234 

235 

def __inner_minify(parsed: bs4.BeautifulSoup) -> None:
    """
    Execute the inner HTML minification routine in place.

    This routine can be applied both before and after ID normalization.

    :param parsed: the parsed document tree to process
    """
    # Drop self-referencing anchors: a <span id=X> whose first child is an
    # empty <a href="#X"> carries no information in the href.
    for tag in parsed("span"):
        span_id = tag.attrs.get("id")
        if (span_id is None) or (not tag.contents):
            continue
        child = tag.contents[0]
        if (child.name != "a") or ("href" not in child.attrs):
            continue
        href = child.attrs["href"]
        if href.startswith("#") and (href[1:] == span_id) \
                and (not (child.contents or child.string)):
            del child.attrs["href"]

    # Collapse nested same-name wrappers (<span><span>...</span></span>,
    # etc.) whenever their attributes can be merged without conflict.
    # Iterating in reverse lets inner collapses enable outer ones.
    for name in ("span", "div", "g"):
        for tag in reversed(list(parsed(name))):
            if (not tag.contents) or (len(tag.contents) != 1) \
                    or tag.string:
                continue
            child = tag.contents[0]
            if child.name != name:
                continue
            if not tag.attrs:
                # the outer tag adds nothing: keep only the child
                tag.replace_with(child)
                continue
            if not child.attrs:
                # move the outer attributes onto the child, then collapse
                child.attrs = tag.attrs
                tag.replace_with(child)
                continue
            if (list(tag.attrs.keys()) == ["id"]) \
                    ^ (list(child.attrs.keys()) == ["id"]):
                # exactly one of the two only carries an "id": merging
                # cannot lose any attribute
                child.attrs.update(tag.attrs)
                tag.replace_with(child)
                continue

277 

#: The pattern matching attribute-less ``<span>`` wrappers around plain
#: text, whose removal is safe.
#: Bug fix: inside the character class, "-", "[" and "]" are now escaped.
#: Previously the unescaped "]" in "()[]{}" terminated the class early
#: (making the tail a literal/alternation fragment) and the unescaped
#: "-" in ".:-_" created an unintended ':'-to-'_' character range.
__USELESS_SPANS: Final[Pattern] = _compile(
    r"<span>([a-zA-Z0-9 \t\n,;.:\-_#'+~*^°!\"§$%&/()\[\]{}=?\\`@|>]*?)"
    r"</span>",
    MULTILINE)

282 

283 

def __html_crusher(text: str,
                   canonicalize_ids: bool = True,
                   purge_mathjax: bool = True,
                   minify: bool = True,
                   purge_scripts: bool = False) -> str:
    """
    Crush the html content.

    :param text: the text coming in
    :param canonicalize_ids: should we canonicalize the IDs?
    :param purge_mathjax: purge all mathjax stuff?
    :param minify: should we minify the HTML output?
    :param purge_scripts: should we purge all javascripts?
    :return: the crushed html text
    """
    parsed: bs4.BeautifulSoup = bs4.BeautifulSoup(text, "html.parser")

    # remove the useless mathjax content
    if purge_mathjax:
        # the assistive mathml duplicates the rendered math invisibly
        for tag in parsed("mjx-assistive-mml"):
            tag.decompose()

        # drop attributes that only matter to the mathjax runtime
        for tag in parsed("use"):
            if "data-c" in tag.attrs:
                del tag.attrs["data-c"]
        for tag in parsed("g"):
            if "data-mml-node" in tag.attrs:
                del tag.attrs["data-mml-node"]
            if "data-mjx-texclass" in tag.attrs:
                del tag.attrs["data-mjx-texclass"]
        for tag in parsed("mjx-container"):
            if "class" in tag.attrs:
                clz = " ".join(tag.attrs["class"])
                clzn = clz.replace(" CtxtMenu_Attached_0", "") \
                    .replace("CtxtMenu_Attached_0 ", "") \
                    .replace(" ", " ").strip()
                # NOTE(review): the last replace maps a space to a space,
                # i.e., it is a no-op — presumably it was meant to collapse
                # double spaces; confirm against the upstream source.
                if clzn != clz:
                    tag.attrs["class"] = clzn
            if "ctxtmenu_counter" in tag.attrs:
                del tag.attrs["ctxtmenu_counter"]
            if "tabindex" in tag.attrs:
                del tag.attrs["tabindex"]

        # purge useless context menu styles and assistive-mml css rules
        for tag in parsed("style"):
            tagtext = tag.string
            if ".CtxtMenu_" in tagtext:
                tag.decompose()
                continue
            changed = False
            while True:  # cut all "mjx-assistive-mml {...}" rules
                idx1 = tagtext.find("mjx-assistive-mml")
                if idx1 < 0:
                    break
                idx2 = tagtext.find("{", idx1)
                if idx2 <= idx1:
                    break
                idx3 = tagtext.find("}", idx2)
                if idx3 <= idx2:
                    break
                tagtext = tagtext[:idx1].strip() \
                    + tagtext[(idx3 + 1):].strip()
                changed = True
            if changed:
                tag.string = tagtext

    # purge all scripts
    if purge_scripts:
        for tag in parsed("script"):
            tag.decompose()

    if minify:
        # merge all styles into the first style tag
        styles = parsed("style")
        if len(styles) > 1:
            all_styles = "".join(tag.string.strip() for tag in styles)
            for tag in styles[1:]:
                tag.decompose()
            styles[0].string = all_styles
        # remove the generator meta data, as it is not needed
        for tag in parsed("meta"):
            if "name" in tag.attrs and tag.attrs["name"] == "generator":
                tag.decompose()

        __inner_minify(parsed)

    # replace all ids with shorter ids
    if canonicalize_ids:
        # first, collect all defined element IDs
        id_counts: dict[str, int] = {}
        for ref in ["id", "name"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if len(a) <= 0:
                    del tag.attrs[ref]  # empty IDs can go right away
                    continue
                if (tag.name.lower() == "meta") and (ref == "name"):
                    continue  # <meta name=...> is not an element ID
                if a in id_counts:
                    raise ValueError(
                        f"id '{a}' in '{ref}' of tag '{tag}' appears twice!")
                id_counts[a] = 0
        # count the fragment references pointing at each ID
        for ref in ["href", "xlink:href"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if a.startswith("#"):
                    a = a[1:].strip()
                    if a not in id_counts:
                        raise ValueError("Found reference to undefined id "
                                         f"'{a}' of tag '{tag}'.")
                    id_counts[a] += 1

        # keep only the referenced IDs
        id_list = [(tid, count) for (tid, count) in id_counts.items()
                   if count > 0]
        del id_counts

        # hand out the shortest IDs to the most-referenced names
        id_list.sort(key=lambda x: -x[1])
        ids: dict[str, str] = {}
        cnt: int = 0
        for idx in id_list:
            ids[idx[0]] = __int2str(cnt)
            cnt += 1
        del id_list, cnt

        # write back the shortened IDs, dropping unreferenced ones
        for ref in ["id", "name"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                if (tag.name.lower() == "meta") and (ref == "name"):
                    continue
                tid = tag.attrs[ref]
                if tid in ids:
                    tag.attrs[ref] = ids[tid]
                else:
                    del tag.attrs[ref]

        # re-link the references to the new IDs
        for ref in ["href", "xlink:href"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if a.startswith("#"):
                    a = a[1:].strip()
                    if a not in ids:
                        raise ValueError(
                            f"Found reference to deleted id '{a}'.")
                    tag.attrs[ref] = f"#{ids[a]}"

        # Since we have minified IDs, we may have purged useless IDs.
        # Thus, maybe we can now purge additional tags.
        if minify:
            __inner_minify(parsed)

    # convert the parsed html back to text; keep it only if it is smaller
    ntext = parsed.__unicode__()
    if len(ntext) < len(text):
        text = ntext

    # apply the final minification step
    if minify:
        ntext = enforce_non_empty_str(
            minify_html.minify(  # pylint: disable=E1101
                text, do_not_minify_doctype=True,
                ensure_spec_compliant_unquoted_attribute_values=True,
                remove_bangs=True,
                remove_processing_instructions=True,
                keep_html_and_head_opening_tags=True,
                keep_spaces_between_attributes=True,
                minify_css=True,
                minify_js=True).strip())
        if len(ntext) < len(text):
            text = ntext
        text = regex_sub(__USELESS_SPANS, "\\1", text)

    return text

463 

464 

#: the digits allowed at the start of a generated ID string: letters only,
#: so that the resulting IDs begin with a letter
__DIGITS_START: Final[str] = string.ascii_letters
#: the digits allowed in the remainder of a generated ID string
__DIGITS: Final[str] = __DIGITS_START + string.digits + "-_"

469 

470 

def __int2str(x: int) -> str:
    """
    Convert a non-negative integer to a compact identifier string.

    The first character always comes from the letters-only alphabet, so
    the result can serve as an HTML element ID; subsequent characters may
    also use digits, "-", and "_". The digits are emitted least-significant
    first and not reversed: the mapping only needs to be injective, not a
    positional number representation, and this keeps a letter in front.

    :param x: the integer
    :return: the compact string
    """
    if x == 0:
        return __DIGITS_START[0]
    alphabet = __DIGITS_START  # letters only for the leading character
    chars: list[str] = []
    while x:
        x, rem = divmod(x, len(alphabet))
        chars.append(alphabet[rem])
        alphabet = __DIGITS  # wider alphabet for all later characters
    return "".join(chars)