Coverage for bookbuilderpy/html.py: 40%
268 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-17 23:15 +0000
1"""Post-process HTML files."""
2import base64
3import os
4import string
5from os.path import exists
6from re import MULTILINE
7from re import compile as _compile
8from typing import Final, Pattern
10import bs4 # type: ignore
11import minify_html # type: ignore
12import regex as reg # type: ignore
13from selenium import webdriver # type: ignore
14from selenium.webdriver.firefox.service import Service
16from bookbuilderpy.logger import logger
17from bookbuilderpy.path import UTF8, Path, move_pure
18from bookbuilderpy.strings import enforce_non_empty_str, regex_sub
19from bookbuilderpy.temp import TempDir
20from bookbuilderpy.versions import TOOL_FIREFOX, TOOL_FIREFOX_DRIVER, has_tool
#: the regexes for java script:
#: each pattern matches one ``<script src="data:application/...;base64,...">``
#: tag (either self-closing or with an empty ``</script>`` body); the second
#: regex group captures the base64 payload, which is later expanded into an
#: inline script by :func:`__base64_unpacker_js`
__REGEXES_URI_JAVASCRIPT: Final[tuple[reg.Regex, ...]] = tuple(
    [reg.compile(  # nosemgrep
        f'<script src=\"data:application/{x}'  # nosemgrep
        '(;\\s*charset=utf-8)?;base64,'  # nosemgrep
        # the base64 body: groups of 4 chars, optionally padded with = or ==
        '((?:[A-Za-z0-9+\\/]{4})*(?:[A-Za-z0-9+\\/]'  # nosemgrep
        '{4}|[A-Za-z0-9+\\/]{3}='  # nosemgrep
        '|[A-Za-z0-9+\\/]{2}={2}))'  # nosemgrep
        '\"(\\s+type="text/javascript")?'  # nosemgrep
        f'{y}',  # nosemgrep
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
        for x in ["octet-stream", "javascript"]  # the observed mime types
        for y in ["\\s*/>", ">\\s*</script>"]],  # the two tag-closing forms
)
#: the regexes for css:
#: each pattern matches one ``<link rel="stylesheet"
#: href="data:application/...;base64,...">`` tag; the second regex group
#: captures the base64 payload, which is later expanded into an inline
#: ``<style>`` element by :func:`__base64_unpacker_css`
__REGEXES_URI_CSS: Final[tuple[reg.Regex, ...]] = tuple(
    [reg.compile(  # nosemgrep
        '<link rel=\"stylesheet\" '  # nosemgrep
        f'href=\"data:application/{x}(;'  # nosemgrep
        # the base64 body: groups of 4 chars, optionally padded with = or ==
        '\\s*charset=utf-8)?;base64,((?:[A-Za-z0-9'  # nosemgrep
        '+\\/]{4})*(?:[A-Za-z'  # nosemgrep
        '0-9+\\/]{4}|[A-Za-z0-9+\\/]{3}=|'  # nosemgrep
        '[A-Za-z0-9+\\/]{2}={2}))\"'  # nosemgrep
        '(\\s+type="text/css")?'  # nosemgrep
        f'{y}',  # nosemgrep
        flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
        for x in ["octet-stream"]  # the observed mime type
        for y in ["\\s*/>", ">\\s*</link>"]],  # the two tag-closing forms
)
def __base64_unpacker(args, start: str, end: str) -> str:
    """
    Convert a base64-encoded data-URI match to plain inline text.

    The second regex group of the match holds the base64 payload. If the
    decoded payload, wrapped into `start` and `end`, is not longer than
    the original matched text, it is returned as the replacement.
    Otherwise the matched text is kept unchanged.

    :param args: the match object produced by one of the data-URI regexes
    :param start: the opening tag for the decoded text
    :param end: the closing tag for the decoded text
    :return: the replacement text
    """
    decoded = base64.b64decode(str(args.groups()[1]).strip()).decode(UTF8)
    res = f"{start}{decoded.strip()}{end}"
    if len(res) <= (args.end() - args.start()):
        return res
    # Bug fix: keep the matched text itself. The previous code returned
    # `str(args).strip()`, i.e., the repr of the match object, which would
    # have injected text like "<regex.Match object ...>" into the output.
    return args.group()
def __base64_unpacker_js(args) -> str:
    """
    Expand a matched base64 javascript data URI into an inline script tag.

    This does not seem to work?

    :param args: the match object of a javascript data-URI regex
    :return: the replacement text
    """
    opening = '<script type="text/javascript">'
    closing = "</script>"
    return __base64_unpacker(args, opening, closing)
def __base64_unpacker_css(args) -> str:
    """
    Expand a matched base64 css data URI into an inline style tag.

    This does not seem to work?

    :param args: the match object of a css data-URI regex
    :return: the replacement text
    """
    opening = '<style type="text/css">'
    return __base64_unpacker(args, opening, "</style>")
def __unpack_data_uris(text: str) -> str:
    """
    Unpack all javascript and css data URIs into plain inline text.

    Every regex is applied repeatedly until a fixed point is reached,
    because expanding one data URI may expose another one.

    :param text: the original html text
    :return: the text with all scripts and styles expanded
    """
    # the (regex set, replacement function) pairs, applied in order
    for regexes, unpacker in (
            (__REGEXES_URI_JAVASCRIPT, __base64_unpacker_js),
            (__REGEXES_URI_CSS, __base64_unpacker_css)):
        for regex in regexes:
            while True:
                tn = reg.sub(regex, unpacker, text)
                # Bug fix: compare by equality, not identity. `tn is text`
                # relied on `sub` returning the very same object when no
                # replacement happened - an implementation detail. If an
                # equal-but-new string were returned, the loop would never
                # terminate.
                if tn == text:
                    break
                text = tn
    return text
# noinspection PyBroadException
def html_postprocess(in_file: str,
                     out_file: str,
                     flatten_data_uris: bool = True,
                     fully_evaluate_html: bool = False,
                     purge_scripts: bool = False,
                     minify: bool = True,
                     purge_mathjax: bool = True,
                     canonicalize_ids: bool = True,
                     overwrite: bool = False) -> Path:
    """
    Post-process a html file.

    The steps, in order: optionally flatten base64 data URIs into inline
    scripts/styles, optionally render the page in a headless Firefox via
    selenium (evaluating javascript), and optionally crush/minify the
    result. The output is only written if some step changed the content;
    otherwise the input is copied (or the intermediate file moved).

    :param in_file: the input file
    :param out_file: the output file
    :param flatten_data_uris: should we flatten data URIs?
    :param fully_evaluate_html: should we use selenium to fully evaluate
        all html and javascript?
    :param purge_scripts: should we purge all javascripts from the file?
    :param minify: should we minify the HTML output?
    :param purge_mathjax: purge all mathjax stuff?
    :param canonicalize_ids: should we canonicalize the IDs?
    :param overwrite: should the output file be overwritten if it exists?
    :return: the output file
    :raises ValueError: if the output exists and `overwrite` is `False`,
        if input and output are the same file, or if the browser returned
        empty html
    """
    source = Path.file(in_file)
    output = Path.path(out_file)
    logger(f"post-processing HTML file from '{source}' to '{output}'.")
    if (not overwrite) and exists(output):
        raise ValueError(f"Output file '{output}' already exists.")
    if source == output:
        raise ValueError(f"Input and output file is the same: '{source}'.")

    current_file: Path = source
    # True means: the in-memory `text` differs from what is on disk and
    # must eventually be written to the output file.
    needs_file_out: bool = False
    text: str = enforce_non_empty_str(source.read_all_str().strip())

    with TempDir.create() as temp:
        if flatten_data_uris:  # flatten data uris
            text_n = enforce_non_empty_str(__unpack_data_uris(text))
            if text_n != text:
                text = text_n
                needs_file_out = True
                logger("flattening the data uris changed the HTML content.")
            else:
                logger("flattening the data uris did not change the "
                       "HTML content.")
            del text_n

        if fully_evaluate_html:  # flatten scripts and html
            # only possible if both firefox and its webdriver are present
            if has_tool(TOOL_FIREFOX_DRIVER) and has_tool(TOOL_FIREFOX):
                options = webdriver.FirefoxOptions()
                options.add_argument("--enable-javascript")
                options.add_argument("-headless")
                # discard the geckodriver log
                service = Service(log_path=os.path.devnull)

                try:
                    browser = webdriver.Firefox(
                        options=options, service=service)
                except BaseException:
                    # retry with the binary location set explicitly, in
                    # case selenium could not locate firefox on its own
                    options.binary_location = TOOL_FIREFOX
                    browser = webdriver.Firefox(
                        options=options, service=service)

                # the browser needs an actual file on disk to load
                if needs_file_out:
                    current_file = temp.resolve_inside("1.html")
                    current_file.write_all(text)
                    needs_file_out = False
                current_file.enforce_file()
                logger(f"invoking '{TOOL_FIREFOX_DRIVER}' via selenium on "
                       f"'{current_file}' to evaluate HTML.")
                browser.get("file:///" + current_file)
                # give javascript a moment to run before grabbing the DOM
                browser.implicitly_wait(1)
                html = browser.page_source
                browser.quit()
                html = html.strip()
                if not html:
                    raise ValueError("Browser returned empty html.")
                # the browser may strip the doctype; restore a minimal one
                if not html.startswith("<!"):
                    html = "<!DOCTYPE HTML>" + html
                if html != text:
                    needs_file_out = True
                    text = html
                    logger("html evaluation did change something.")
                else:
                    logger("html evaluation changed nothing.")
                del html
            else:
                logger(f"cannot use HTML evaluation, '{TOOL_FIREFOX}' or '"
                       f"{TOOL_FIREFOX_DRIVER}' not present.")

        if minify or canonicalize_ids or purge_scripts:  # minify output
            ntext = enforce_non_empty_str(__html_crusher(
                text, canonicalize_ids=canonicalize_ids,
                purge_mathjax=purge_mathjax,
                minify=minify,
                purge_scripts=purge_scripts))
            if ntext != text:
                needs_file_out = True
                text = ntext
                logger("html minification has changed the content.")
            else:
                logger("html minification had no impact")
            del ntext

        if needs_file_out:
            logger(f"writing post-processing result to '{output}'.")
            output.write_all(text)
        elif current_file == source:
            logger(f"copying HTML from '{source}' to '{output}'.")
            Path.copy_file(source, output)
        else:
            logger(f"moving HTML from '{current_file}' to '{output}'.")
            move_pure(current_file, output)

    output.enforce_file()
    return output
def __inner_minify(parsed: bs4.BeautifulSoup) -> None:
    """
    Execute the inner HTML minification routine.

    This routine can be applied before and after ID normalization.
    It works in-place on the parsed soup and performs two clean-ups:
    it drops self-referencing `href` attributes from empty anchors
    inside `span`s, and it merges nested single-child
    `span`/`div`/`g` tags where that cannot lose information.

    :param parsed: the parsed document, modified in-place
    """
    # try to discover and purge useless references
    for tag in parsed("span"):
        if "id" in tag.attrs:
            tagid = tag.attrs["id"]
            if tag.contents:
                child = tag.contents[0]
                if child.name == "a" and "href" in child.attrs:
                    ref = child.attrs["href"]
                    # an empty <a href="#X"> inside <span id="X"> only
                    # links to its own parent, so the href is useless
                    if ref.startswith("#") and (ref[1:] == tagid) \
                            and (not (child.contents or child.string)):
                        del child.attrs["href"]

    # replace tags with their children if they have no attributes
    # or other contents
    for name in ["span", "div", "g"]:
        # reversed() processes later (deeper) matches first, so inner
        # merges happen before their enclosing tags are considered
        for tag in reversed(list(parsed(name))):
            if tag.contents and (len(tag.contents) == 1) and \
                    (not tag.string):
                child = tag.contents[0]
                if child.name == name:
                    if not tag.attrs:
                        tag.replace_with(child)
                        continue
                    if not child.attrs:
                        child.attrs = tag.attrs
                        tag.replace_with(child)
                        continue
                    # merge when exactly one of the two tags carries
                    # only an "id" attribute (note the XOR).
                    # NOTE(review): if both tags have an "id", update()
                    # overwrites the child's id with the parent's -
                    # confirm no referenced id can be lost here.
                    if (list(tag.attrs.keys()) == ["id"]) \
                            ^ (list(child.attrs.keys()) == ["id"]):
                        child.attrs.update(tag.attrs)
                        tag.replace_with(child)
                        continue
#: Matches an attribute-less <span> whose content consists only of plain
#: characters (letters, digits, whitespace, and common punctuation - but
#: not "<", so no nested tags). Such spans carry no information and are
#: unwrapped in the final minification step.
#: Bug fix: in the previous pattern, the unescaped "]" in "()[]{}" closed
#: the character class early (turning the tail into a stray alternation)
#: and ".:-_" formed an unintended character range; the pattern therefore
#: never matched the spans it was meant to match. "[", "]" and "-" are
#: now escaped so they are literal members of the class.
__USELESS_SPANS: Final[Pattern] = _compile(
    r"<span>([a-zA-Z0-9 \t\n,;.:\-_#'+~*^°!\"§$%&/()\[\]{}=?\\`@|>]*?)"
    r"</span>",
    MULTILINE)
def __html_crusher(text: str,
                   canonicalize_ids: bool = True,
                   purge_mathjax: bool = True,
                   minify: bool = True,
                   purge_scripts: bool = False) -> str:
    """
    Crush the html content.

    The document is parsed with BeautifulSoup, cleaned up in-place
    (mathjax leftovers, scripts, styles, ids), serialized back, and
    finally passed through `minify_html`. Each serialization is only
    kept if it is actually smaller than the text it replaces.

    :param text: the text coming in
    :param canonicalize_ids: should we canonicalize the IDs?
    :param purge_mathjax: purge all mathjax stuff?
    :param minify: should we minify the HTML output?
    :param purge_scripts: should we purge all javascripts?
    :return: the crushed html text
    :raises ValueError: if an id is defined twice or a local reference
        points to a non-existing id
    """
    parsed: bs4.BeautifulSoup = bs4.BeautifulSoup(text, "html.parser")

    # remove the useless mathjax content
    if purge_mathjax:
        # delete useless mathml content
        for tag in parsed("mjx-assistive-mml"):
            tag.decompose()

        # delete useless components of tags
        for tag in parsed("use"):
            if "data-c" in tag.attrs:
                del tag.attrs["data-c"]
        for tag in parsed("g"):
            if "data-mml-node" in tag.attrs:
                del tag.attrs["data-mml-node"]
            if "data-mjx-texclass" in tag.attrs:
                del tag.attrs["data-mjx-texclass"]
        for tag in parsed("mjx-container"):
            if "class" in tag.attrs:
                # strip the "CtxtMenu_Attached_0" marker class.
                # NOTE(review): the `.replace(" ", " ")` below is a no-op
                # as written; it was presumably `.replace("  ", " ")`
                # (collapse double spaces) before whitespace got mangled
                # in extraction - confirm against the original source.
                clz = " ".join(tag.attrs["class"])
                clzn = clz.replace(" CtxtMenu_Attached_0", "") \
                    .replace("CtxtMenu_Attached_0 ", "") \
                    .replace(" ", " ").strip()
                if clzn != clz:
                    tag.attrs["class"] = clzn
            if "ctxtmenu_counter" in tag.attrs:
                del tag.attrs["ctxtmenu_counter"]
            if "tabindex" in tag.attrs:
                del tag.attrs["tabindex"]

        # purge useless context menu styles
        for tag in parsed("style"):
            tagtext = tag.string
            if ".CtxtMenu_" in tagtext:
                tag.decompose()
                continue
            # cut every "mjx-assistive-mml ... { ... }" rule out of the
            # style sheet text
            found = False
            while True:
                idx1 = tagtext.find("mjx-assistive-mml")
                if idx1 < 0:
                    break
                idx2 = tagtext.find("{", idx1)
                if idx2 <= idx1:
                    break
                idx3 = tagtext.find("}", idx2)
                if idx3 <= idx2:
                    break
                tagtext = tagtext[:idx1].strip() + \
                    tagtext[(idx3 + 1):].strip()
                found = True
            if found:
                tag.string = tagtext

    # purge all scripts
    if purge_scripts:
        for tag in parsed("script"):
            tag.decompose()

    if minify:
        # merge all styles into the first style tag
        styles = parsed("style")
        if len(styles) > 1:
            all_styles = "".join(tag.string.strip() for tag in styles)
            for tag in styles[1:]:
                tag.decompose()
            styles[0].string = all_styles

        # remove the generator meta data, as it is not needed
        for tag in parsed("meta"):
            if "name" in tag.attrs and tag.attrs["name"] == "generator":
                tag.decompose()

        __inner_minify(parsed)

    # replace all ids with shorter ids
    if canonicalize_ids:
        # first, we try to minify the element IDs
        id_counts: dict[str, int] = {}
        # find all IDs; ids must be globally unique across both the
        # "id" and "name" attributes, otherwise we raise an error
        for ref in ["id", "name"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if len(a) <= 0:
                    del tag.attrs[ref]  # drop empty ids right away
                    continue
                # <meta name="..."> is not an anchor - leave it alone
                if (tag.name.lower() == "meta") and (ref == "name"):
                    continue
                if a in id_counts:
                    raise ValueError(
                        f"id '{a}' in '{ref}' of tag '{tag}' appears twice!")
                id_counts[a] = 0
        # count the references to them (only local "#..." references)
        for ref in ["href", "xlink:href"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if a.startswith("#"):
                    a = a[1:].strip()
                    if a not in id_counts:
                        raise ValueError("Found reference to undefined id "
                                         f"'{a}' of tag '{tag}'.")
                    id_counts[a] += 1

        # purge all unreferenced ids
        id_list = [(tid, count) for (tid, count) in id_counts.items()
                   if count > 0]
        del id_counts

        # create smaller IDs: the most-referenced ids get the shortest
        # replacement strings
        id_list.sort(key=lambda x: -x[1])
        ids: dict[str, str] = {}
        cnt: int = 0
        for idx in id_list:
            ids[idx[0]] = __int2str(cnt)
            cnt += 1
        del id_list, cnt

        # write back the ids; unreferenced ids are deleted entirely
        for ref in ["id", "name"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                if (tag.name.lower() == "meta") and (ref == "name"):
                    continue
                tid = tag.attrs[ref]
                if tid in ids:
                    tag.attrs[ref] = ids[tid]
                else:
                    del tag.attrs[ref]

        # re-link the references to the new, shorter ids
        for ref in ["href", "xlink:href"]:
            for tag in parsed.findAll(lambda tg, rr=ref: rr in tg.attrs):
                a = tag.attrs[ref]
                if a.startswith("#"):
                    a = a[1:].strip()
                    if a not in ids:
                        raise ValueError(
                            f"Found reference to deleted id '{a}'.")
                    tag.attrs[ref] = f"#{ids[a]}"

        # Since we have minified IDs, we may have purged useless IDs.
        # Thus, maybe we can now purge additional tags.
        if minify:
            __inner_minify(parsed)

    # convert the parsed html back to text and check if it is smaller.
    # NOTE(review): `__unicode__()` is a Python-2-era bs4 method; confirm
    # it still exists in the installed bs4 version - `str(parsed)` would
    # be the modern equivalent.
    ntext = parsed.__unicode__()
    if len(ntext) < len(text):
        text = ntext

    # apply the final minification step
    if minify:
        ntext = enforce_non_empty_str(
            minify_html.minify(  # pylint: disable=E1101
                text, do_not_minify_doctype=True,
                ensure_spec_compliant_unquoted_attribute_values=True,
                remove_bangs=True,
                remove_processing_instructions=True,
                keep_html_and_head_opening_tags=True,
                keep_spaces_between_attributes=True,
                minify_css=True,
                minify_js=True).strip())
        if len(ntext) < len(text):
            text = ntext
        # finally, unwrap the remaining useless <span> elements
        text = regex_sub(__USELESS_SPANS, "\\1", text)

    return text
#: the digits permitted in the first position of an id string: letters
#: only, so that every generated id starts with a letter
__DIGITS_START = string.ascii_letters
#: the digits permitted in all further positions of an id string
__DIGITS = __DIGITS_START + string.digits + "-_"


def __int2str(x: int) -> str:
    """
    Convert an integer to a compact identifier string.

    The first character always comes from :data:`__DIGITS_START`, all
    later characters from the larger alphabet :data:`__DIGITS`. Digits
    are emitted least-significant first; since the mapping is a
    bijection, that is sufficient for producing unique short ids.

    :param x: the integer
    :return: the compact string
    """
    alphabet = __DIGITS_START
    if x == 0:
        return alphabet[0]
    chars: list[str] = []
    while x:
        x, rem = divmod(x, len(alphabet))
        chars.append(alphabet[rem])
        alphabet = __DIGITS
    return "".join(chars)