Coverage for bookbuilderpy/pandoc.py: 14%

148 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-17 23:15 +0000

1"""A routine for invoking pandoc.""" 

2 

3import os.path 

4from typing import Callable, Final 

5 

6import bookbuilderpy.constants as bc 

7from bookbuilderpy.build_result import File 

8from bookbuilderpy.html import html_postprocess 

9from bookbuilderpy.logger import logger 

10from bookbuilderpy.path import Path 

11from bookbuilderpy.pdf import pdf_postprocess 

12from bookbuilderpy.resources import ResourceServer 

13from bookbuilderpy.shell import shell 

14from bookbuilderpy.strings import ( 

15 enforce_non_empty_str, 

16 enforce_non_empty_str_without_ws, 

17 regex_sub, 

18) 

19from bookbuilderpy.temp import TempFile 

20from bookbuilderpy.versions import TOOL_PANDOC, has_tool 

21 

#: The meanings of the pandoc exit codes: maps each numeric exit code to
#: the name of the pandoc error it stands for. The mapping is handed to
#: ``shell`` as ``exit_code_to_str`` when pandoc is invoked.
__EXIT_CODES: Final[dict[int, str]] = {
    3: "PandocFailOnWarningError",
    4: "PandocAppError",
    5: "PandocTemplateError",
    6: "PandocOptionError",
    21: "PandocUnknownReaderError",
    22: "PandocUnknownWriterError",
    23: "PandocUnsupportedExtensionError",
    24: "PandocCiteprocError",
    31: "PandocEpubSubdirectoryError",
    43: "PandocPDFError",
    44: "PandocXMLError",
    47: "PandocPDFProgramNotFoundError",
    61: "PandocHttpError",
    62: "PandocShouldNeverHappenError",
    63: "PandocSomeError",
    64: "PandocParseError",
    65: "PandocParsecError",
    66: "PandocMakePDFError",
    67: "PandocSyntaxMapError",
    83: "PandocFilterError",
    91: "PandocMacroLoop",
    92: "PandocUTF8DecodingError",
    93: "PandocIpynbDecodingError",
    94: "PandocUnsupportedCharsetError",
    97: "PandocCouldNotFindDataFileError",
    99: "PandocResourceNotFound",
}

51 

52 

53def __pandoc_check_stderr(stderr: str) -> BaseException | None: 

54 """ 

55 Check the standard error output of pandoc. 

56 

57 :param stderr: the standard error string 

58 """ 

59 if stderr is None: 

60 return None 

61 if "Undefined cross-reference" in stderr: 

62 return ValueError("Undefined cross-reference!") 

63 if "[WARNING] Citeproc" in stderr: 

64 return ValueError("Undefined citation!") 

65 return None 

66 

67 

def pandoc(source_file: str,
           dest_file: str,
           format_in: str = bc.PANDOC_FORMAT_MARKDOWN,
           format_out: str = bc.PANDOC_FORMAT_LATEX,
           locale: str | None = None,
           standalone: bool = True,
           tabstops: int | None = 2,
           toc_print: bool = True,
           toc_depth: int = 3,
           crossref: bool = True,
           bibliography: bool = True,
           template: str | None = None,
           csl: str | None = None,
           number_sections: bool = True,
           args: list[str] | None = None,
           resolve_resources: Callable = lambda *_: None,
           overwrite: bool = False) -> File:
    """
    Invoke pandoc.

    :param source_file: the source file
    :param dest_file: the destination file
    :param format_in: the input format
    :param format_out: the output format
    :param locale: the language to be used for compiling
    :param standalone: should we produce a stand-alone document?
    :param tabstops: the number of spaces with which we replace
        a tab character, or None to not replace
    :param toc_print: should we print the table of contents
    :param toc_depth: the depth of the table of contents
    :param crossref: should we use crossref
    :param bibliography: should we use a bibliography
    :param template: which template should we use, if any?
    :param csl: which csl file should we use, if any?
    :param number_sections: should sections be numbered?
    :param args: any additional arguments
    :param resolve_resources: a function to resolve resources; it is
        called with the resource name and the input directory and returns
        either a path to the resolved file or `None`
    :param overwrite: should the output file be overwritten if it exists?
    :return: the Path to the generated output file and its size
    :raises ValueError: if pandoc is not installed, if the output file
        exists while `overwrite` is `False`, if input and output are the
        same file, or if `tabstops` or `toc_depth` are not positive
    """
    if not has_tool(TOOL_PANDOC):
        raise ValueError("Pandoc is not installed.")

    output_file = Path.path(dest_file)
    if (not overwrite) and os.path.exists(output_file):
        raise ValueError(f"Output file '{output_file}' already exists.")
    input_file = Path.file(source_file)
    if input_file == output_file:
        raise ValueError(
            f"Input '{input_file}' must differ from output '{output_file}'.")
    output_dir = Path.path(os.path.dirname(output_file))
    output_dir.ensure_dir_exists()
    input_dir = Path.directory(os.path.dirname(input_file))

    logger(f"applying pandoc to generate output file '{output_file}' "
           f"from '{input_file}'.")

    format_in = enforce_non_empty_str_without_ws(format_in)
    format_out = enforce_non_empty_str_without_ws(format_out)

    # Markdown input gets a fixed set of pandoc extensions switched on.
    if format_in.startswith("markdown"):
        format_in = "+".join([format_in,
                              "definition_lists",
                              "smart",
                              "fenced_code_blocks",
                              "fenced_code_attributes",
                              "line_blocks",
                              "inline_code_attributes",
                              "latex_macros",
                              "implicit_figures",
                              "pipe_tables",
                              "raw_attribute"])
    cmd: Final[list[str]] = [TOOL_PANDOC,
                             f"--from={format_in}",
                             f"--write={format_out}",
                             f"--output={output_file}",
                             "--fail-if-warnings",
                             "--strip-comments"]

    if tabstops is not None:
        if tabstops <= 0:
            raise ValueError(f"tabstops cannot be {tabstops}.")
        cmd.append(f"--tab-stop={tabstops}")

    if standalone:
        cmd.append("--standalone")

    if number_sections:
        cmd.append("--number-sections")

    if toc_print:
        cmd.append("--table-of-contents")
        if toc_depth is not None:
            if toc_depth <= 0:
                raise ValueError(f"toc_depth cannot be {toc_depth}.")
            cmd.append(f"--toc-depth={toc_depth}")

    template_file: Path | None = None
    if template is not None:
        template = enforce_non_empty_str_without_ws(template)
        template_file = resolve_resources(template, input_dir)
        if template_file is not None:
            # Use the resolved file instead of the raw template name.
            template_file.enforce_file()
            template = template_file
        cmd.append(f"--template={template}")

    if crossref:
        cmd.append("--filter=pandoc-crossref")

    csl_file: Path | None = None
    if bibliography:
        cmd.append("--citeproc")
        if csl is not None:
            csl = enforce_non_empty_str_without_ws(csl)
            csl_file = resolve_resources(csl, input_dir)
            if csl_file is not None:
                # Use the resolved file instead of the raw csl name.
                csl_file.enforce_file()
                csl = csl_file
            cmd.append(f"--csl={csl}")

    if args is not None:
        cmd.extend([enforce_non_empty_str(a).strip()
                    for a in args])
    cmd.append(input_file)

    if locale is not None:
        locale = enforce_non_empty_str_without_ws(locale)
        # Pass the variable as one self-contained long option; an argv
        # element "-V lang=..." would carry the separating space into the
        # option's value.
        cmd.append(f"--variable=lang={locale.replace('_', '-')}")

    try:
        shell(cmd, timeout=600, cwd=input_dir, exit_code_to_str=__EXIT_CODES,
              check_stderr=__pandoc_check_stderr)
    finally:
        # Files handed back by resolve_resources are deleted after the
        # run, whether or not pandoc succeeded.
        # NOTE(review): presumably these are temporary copies -- confirm
        # against the resolver implementations.
        if template_file:
            os.remove(template_file)
        if csl_file:
            os.remove(csl_file)

    res = File(output_file)

    logger(f"finished applying pandoc, got output file "
           f"'{res.path}' of size {res.size} bytes.")
    return res

211 

212 

def latex(source_file: str,
          dest_file: str,
          format_in: str = bc.PANDOC_FORMAT_MARKDOWN,
          locale: str | None = None,
          standalone: bool = True,
          tabstops: int | None = 2,
          toc_print: bool = True,
          toc_depth: int = 3,
          crossref: bool = True,
          bibliography: bool = True,
          number_sections: bool = True,
          top_level_division: str = "chapter",
          use_listings: bool = False,
          get_meta: Callable = lambda x: None,
          resolve_resources: Callable = lambda *_: None) -> File:
    """
    Invoke pandoc to build LaTeX and then PDF output.

    Pandoc first renders into a temporary `.pdf` file, which is then
    passed through `pdf_postprocess` to produce `dest_file`.

    :param source_file: the source file
    :param dest_file: the destination file
    :param format_in: the input format
    :param locale: the language to be used for compiling
    :param standalone: should we produce a stand-alone document?
    :param tabstops: the number of spaces with which we replace
        a tab character, or None to not replace
    :param toc_print: should we print the table of contents
    :param toc_depth: the depth of the table of contents
    :param crossref: should we use crossref
    :param bibliography: should we use a bibliography
    :param number_sections: should sections be numbered?
    :param top_level_division: the top-level division
    :param use_listings: should the listings package be used?
    :param get_meta: a function to access meta-data; it is queried for
        the LaTeX template and the csl file
    :param resolve_resources: a function to resolve resources; it is
        forwarded to :func:`pandoc`, which calls it with the resource name
        and the input directory
    :return: the Path to the generated output file and its size
    """
    args: list[str] = []
    if locale is not None:
        locale = enforce_non_empty_str_without_ws(locale)
        # Chinese locales are compiled with xelatex.
        if locale == "zh" or locale.startswith(("zh-", "zh_")):
            args.append("--pdf-engine=xelatex")
    top_level_division = enforce_non_empty_str_without_ws(top_level_division)
    args.append(f"--top-level-division={top_level_division}")
    if use_listings:
        args.append("--listings")

    with TempFile.create(suffix=".pdf") as tf:
        res = pandoc(source_file=source_file,
                     dest_file=tf,
                     format_in=format_in,
                     format_out=bc.PANDOC_FORMAT_LATEX,
                     standalone=standalone,
                     tabstops=tabstops,
                     toc_print=toc_print,
                     toc_depth=toc_depth,
                     crossref=crossref,
                     bibliography=bibliography,
                     template=get_meta(bc.PANDOC_TEMPLATE_LATEX),
                     csl=get_meta(bc.PANDOC_CSL),
                     number_sections=number_sections,
                     locale=locale,
                     resolve_resources=resolve_resources,
                     args=args,
                     overwrite=True).path
        res = pdf_postprocess(in_file=res,
                              out_file=dest_file)
    return File(res)

280 

281 

def html(source_file: str,
         dest_file: str,
         format_in: str = bc.PANDOC_FORMAT_MARKDOWN,
         locale: str | None = None,
         standalone: bool = True,
         tabstops: int | None = 2,
         toc_print: bool = True,
         toc_depth: int = 3,
         crossref: bool = True,
         bibliography: bool = True,
         number_sections: bool = True,
         get_meta: Callable = lambda x: None,
         resolve_resources: Callable = lambda *_: None) -> File:
    """
    Invoke pandoc to build HTML output.

    Pandoc renders into a temporary `.html` file while a
    `ResourceServer` is running. If a bibliography is used, the
    bibliography markup in that file is patched. Finally the file is
    passed through `html_postprocess` to produce `dest_file`.

    :param source_file: the source file
    :param dest_file: the destination file
    :param format_in: the input format
    :param locale: the language to be used for compiling
    :param standalone: should we produce a stand-alone document?
    :param tabstops: the number of spaces with which we replace
        a tab character, or None to not replace
    :param toc_print: should we print the table of contents
    :param toc_depth: the depth of the table of contents
    :param crossref: should we use crossref
    :param bibliography: should we use a bibliography
    :param number_sections: should sections be numbered?
    :param get_meta: a function to access meta-data; it is queried for
        the HTML5 template and the csl file
    :param resolve_resources: a function to resolve resources; it is
        forwarded to :func:`pandoc`, which calls it with the resource name
        and the input directory
    :return: the Path to the generated output file and its size
    :raises ValueError: if anything goes wrong; other exceptions derived
        from `Exception` are re-raised as `ValueError` with the original
        as cause
    """
    endresult: Path | None = None  # nosem # type: ignore # nolint
    try:
        with TempFile.create(suffix=".html") as tmp:
            # noinspection PyUnusedLocal
            inner_file: Path | None = None  # nosem # type: ignore
            try:
                with ResourceServer() as serv:
                    inner_file = pandoc(
                        source_file=source_file,
                        dest_file=tmp,
                        format_in=format_in,
                        format_out=bc.PANDOC_FORMAT_HTML5,
                        locale=locale,
                        standalone=standalone,
                        tabstops=tabstops,
                        toc_print=toc_print,
                        toc_depth=toc_depth,
                        crossref=crossref,
                        bibliography=bibliography,
                        template=get_meta(bc.PANDOC_TEMPLATE_HTML5),
                        csl=get_meta(bc.PANDOC_CSL),
                        number_sections=number_sections,
                        resolve_resources=resolve_resources,
                        overwrite=True,
                        args=[f"--mathjax={serv.get_mathjax_url()}",
                              "--ascii", "--html-q-tags",
                              # eventually replace --self-contained with
                              # "--embed-resources"
                              "--self-contained"]).path
                    if inner_file is not None:
                        inner_file.enforce_file()
            except ValueError:
                raise
            except Exception as ve:
                # Only ordinary errors are converted; KeyboardInterrupt
                # and SystemExit must propagate unchanged.
                raise ValueError from ve

            if inner_file is None:
                raise ValueError("Huh? pandoc did not return a file?")  # noqa

            if bibliography:
                # For some reason, the id and the text of each bibliography
                # item are each put into separate divs of classes for which
                # no styles are given. Therefore, we convert these divs to
                # spans and add some vertical spacing.
                text = enforce_non_empty_str(
                    inner_file.read_all_str().strip())
                end = text.rfind('<div id="refs"')
                if end > 0:
                    # Only the part from the references div onwards is
                    # rewritten; everything before it stays untouched.
                    text_1 = text[:end]
                    text_2 = text[end:]
                    del text

                    text_2 = regex_sub(
                        '\\s*<div\\s+class="\\s*csl-left-margin\\s*"\\s*>'
                        '\\s*(.*?)\\s*</div>\\s*',
                        '<span class="csl-left-margin">\\1</span>&nbsp;',
                        text_2)

                    text_2 = regex_sub(
                        '\\s*<div\\s+class="\\s*csl-right-inline\\s*"\\s*>'
                        '\\s*(.*?)\\s*</div>\\s*',
                        '<span class="csl-right-inline">\\1</span>',
                        text_2)

                    text_2 = text_2.replace(
                        ' class="csl-entry" role="doc-biblioentry">',
                        ' class="csl-entry" role="doc-biblioentry" '
                        'style="margin-top:0.33em">')

                    inner_file.write_all([text_1, text_2])
            endresult = html_postprocess(in_file=inner_file,
                                         out_file=dest_file,
                                         flatten_data_uris=True,
                                         fully_evaluate_html=True,
                                         purge_scripts=True,
                                         minify=True,
                                         purge_mathjax=True,
                                         canonicalize_ids=True,
                                         overwrite=False)
    except ValueError:
        raise
    except Exception as be:
        # Same policy as above: do not turn interpreter-level exits into
        # a ValueError.
        raise ValueError from be

    if endresult is None:
        raise ValueError("end result is still None?")
    return File(endresult)

402 

403 

def epub(source_file: str,
         dest_file: str,
         format_in: str = bc.PANDOC_FORMAT_MARKDOWN,
         locale: str | None = None,
         standalone: bool = True,
         tabstops: int | None = 2,
         toc_print: bool = True,
         toc_depth: int = 3,
         crossref: bool = True,
         bibliography: bool = True,
         number_sections: bool = True,
         get_meta: Callable = lambda x: None,
         resolve_resources: Callable = lambda *_: None) -> File:
    """
    Invoke pandoc to build epub output.

    This is a thin wrapper around :func:`pandoc` which writes directly to
    `dest_file`. Since `overwrite` is not passed on, `dest_file` must not
    exist yet.

    :param source_file: the source file
    :param dest_file: the destination file
    :param format_in: the input format
    :param locale: the language to be used for compiling
    :param standalone: should we produce a stand-alone document?
    :param tabstops: the number of spaces with which we replace
        a tab character, or None to not replace
    :param toc_print: should we print the table of contents
    :param toc_depth: the depth of the table of contents
    :param crossref: should we use crossref
    :param bibliography: should we use a bibliography
    :param number_sections: should sections be numbered?
    :param get_meta: a function to access meta-data; it is queried for
        the epub template and the csl file
    :param resolve_resources: a function to resolve resources; it is
        forwarded to :func:`pandoc`, which calls it with the resource name
        and the input directory
    :return: the Path to the generated output file and its size
    """
    return pandoc(source_file=source_file,
                  dest_file=dest_file,
                  format_in=format_in,
                  format_out=bc.PANDOC_FORMAT_EPUB,
                  locale=locale,
                  standalone=standalone,
                  tabstops=tabstops,
                  toc_print=toc_print,
                  toc_depth=toc_depth,
                  crossref=crossref,
                  bibliography=bibliography,
                  template=get_meta(bc.PANDOC_TEMPLATE_EPUB),
                  csl=get_meta(bc.PANDOC_CSL),
                  number_sections=number_sections,
                  resolve_resources=resolve_resources,
                  args=["--mathml", "--ascii", "--html-q-tags",
                        "--self-contained"])

453 

454 

def azw3(epub_file: str) -> File:
    """
    Convert an epub book into an azw3 one by running `ebook-convert`.

    The azw3 file is created in the directory of the epub file. Its name
    is the prefix of the epub's file name followed by `.azw3`.

    :param epub_file: the epub file
    :return: the azw3 file
    """
    epub_path = Path.file(epub_file)
    folder = Path.directory(os.path.dirname(epub_path))

    # Derive the target name from the epub's base name.
    stem, _suffix = Path.split_prefix_suffix(os.path.basename(epub_path))
    target = Path.resolve_inside(folder, f"{stem}.azw3")

    shell(["ebook-convert", epub_path, target, "--embed-all-fonts"],
          timeout=360, cwd=folder)
    return File(target)