Coverage for bookbuilderpy/pandoc.py: 14%
148 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-17 23:15 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-17 23:15 +0000
1"""A routine for invoking pandoc."""
3import os.path
4from typing import Callable, Final
6import bookbuilderpy.constants as bc
7from bookbuilderpy.build_result import File
8from bookbuilderpy.html import html_postprocess
9from bookbuilderpy.logger import logger
10from bookbuilderpy.path import Path
11from bookbuilderpy.pdf import pdf_postprocess
12from bookbuilderpy.resources import ResourceServer
13from bookbuilderpy.shell import shell
14from bookbuilderpy.strings import (
15 enforce_non_empty_str,
16 enforce_non_empty_str_without_ws,
17 regex_sub,
18)
19from bookbuilderpy.temp import TempFile
20from bookbuilderpy.versions import TOOL_PANDOC, has_tool
#: The meanings of the pandoc exit codes, mapping the numerical exit code
#: to the symbolic error name used in the "Exit codes" table of the pandoc
#: user's guide. All entries of the original mapping are kept unchanged;
#: codes 1, 25, 84, 89, and 98 were added from that table.
__EXIT_CODES: dict[int, str] = {
    1: "PandocIOError",
    3: "PandocFailOnWarningError",
    4: "PandocAppError",
    5: "PandocTemplateError",
    6: "PandocOptionError",
    21: "PandocUnknownReaderError",
    22: "PandocUnknownWriterError",
    23: "PandocUnsupportedExtensionError",
    24: "PandocCiteprocError",
    25: "PandocBibliographyError",
    31: "PandocEpubSubdirectoryError",
    43: "PandocPDFError",
    44: "PandocXMLError",
    47: "PandocPDFProgramNotFoundError",
    61: "PandocHttpError",
    62: "PandocShouldNeverHappenError",
    63: "PandocSomeError",
    64: "PandocParseError",
    65: "PandocParsecError",
    66: "PandocMakePDFError",
    67: "PandocSyntaxMapError",
    83: "PandocFilterError",
    84: "PandocLuaError",
    89: "PandocNoScriptingEngine",
    91: "PandocMacroLoop",
    92: "PandocUTF8DecodingError",
    93: "PandocIpynbDecodingError",
    94: "PandocUnsupportedCharsetError",
    97: "PandocCouldNotFindDataFileError",
    98: "PandocCouldNotFindMetadataFileError",
    99: "PandocResourceNotFound",
}
53def __pandoc_check_stderr(stderr: str) -> BaseException | None:
54 """
55 Check the standard error output of pandoc.
57 :param stderr: the standard error string
58 """
59 if stderr is None:
60 return None
61 if "Undefined cross-reference" in stderr:
62 return ValueError("Undefined cross-reference!")
63 if "[WARNING] Citeproc" in stderr:
64 return ValueError("Undefined citation!")
65 return None
def pandoc(source_file: str,
           dest_file: str,
           format_in: str = bc.PANDOC_FORMAT_MARKDOWN,
           format_out: str = bc.PANDOC_FORMAT_LATEX,
           locale: str | None = None,
           standalone: bool = True,
           tabstops: int | None = 2,
           toc_print: bool = True,
           toc_depth: int = 3,
           crossref: bool = True,
           bibliography: bool = True,
           template: str | None = None,
           csl: str | None = None,
           number_sections: bool = True,
           args: list[str] | None = None,
           resolve_resources: Callable = lambda *_: None,
           overwrite: bool = False) -> File:
    """
    Invoke pandoc.

    The command line is assembled from the parameters and executed with
    the directory of the source file as working directory. Pandoc is run
    with `--fail-if-warnings`, and its standard error output is
    additionally checked for undefined cross-references and citations.

    :param source_file: the source file
    :param dest_file: the destination file
    :param format_in: the input format
    :param format_out: the output format
    :param locale: the language to be used for compiling
    :param standalone: should we produce a stand-alone document?
    :param tabstops: the number of spaces with which we replace
        a tab character, or None to not replace
    :param toc_print: should we print the table of contents
    :param toc_depth: the depth of the table of contents, only used if
        `toc_print` is `True`
    :param crossref: should we use crossref
    :param bibliography: should we use a bibliography
    :param template: which template should we use, if any?
    :param csl: which csl file should we use, if any? Only used if
        `bibliography` is `True`.
    :param number_sections: should sections be numbered?
    :param args: any additional arguments
    :param resolve_resources: a function to resolve resources; it is
        called as `resolve_resources(name, input_dir)` and returns either
        the path to the resolved file or `None`. The default resolves
        nothing.
    :param overwrite: should the output file be overwritten if it exists?
    :return: the Path to the generated output file and its size
    :raises ValueError: if pandoc is not installed, if the output file
        exists while `overwrite` is `False`, if input and output are the
        same file, or if `tabstops` or `toc_depth` is not positive
    """
    if not has_tool(TOOL_PANDOC):
        raise ValueError("Pandoc is not installed.")

    output_file = Path.path(dest_file)
    if (not overwrite) and os.path.exists(output_file):
        raise ValueError(f"Output file '{output_file}' already exists.")
    input_file = Path.file(source_file)
    if input_file == output_file:
        raise ValueError(
            f"Input '{input_file}' must differ from output '{output_file}'.")
    output_dir = Path.path(os.path.dirname(output_file))
    output_dir.ensure_dir_exists()
    input_dir = Path.directory(os.path.dirname(input_file))

    logger(f"applying pandoc to generate output file '{output_file}' "
           f"from '{input_file}'.")

    format_in = enforce_non_empty_str_without_ws(format_in)
    format_out = enforce_non_empty_str_without_ws(format_out)

    if format_in.startswith("markdown"):
        # enable the markdown extensions that the books rely on
        format_in = "+".join([format_in,
                              "definition_lists",
                              "smart",
                              "fenced_code_blocks",
                              "fenced_code_attributes",
                              "line_blocks",
                              "inline_code_attributes",
                              "latex_macros",
                              "implicit_figures",
                              "pipe_tables",
                              "raw_attribute"])
    cmd: Final[list[str]] = [TOOL_PANDOC,
                             f"--from={format_in}",
                             f"--write={format_out}",
                             f"--output={output_file}",
                             "--fail-if-warnings",
                             "--strip-comments"]

    if tabstops is not None:
        if tabstops <= 0:
            raise ValueError(f"tabstops cannot be {tabstops}.")
        cmd.append(f"--tab-stop={tabstops}")

    if standalone:
        cmd.append("--standalone")

    if number_sections:
        cmd.append("--number-sections")

    if toc_print:
        cmd.append("--table-of-contents")
        if toc_depth is not None:
            if toc_depth <= 0:
                raise ValueError(f"toc_depth cannot be {toc_depth}.")
            cmd.append(f"--toc-depth={toc_depth}")

    # If the resolver returns a file, that file is used as template and
    # is deleted again after pandoc has finished (see below). Otherwise,
    # the template name is handed to pandoc as-is.
    template_file: Path | None = None
    if template is not None:
        template = enforce_non_empty_str_without_ws(template)
        template_file = resolve_resources(template, input_dir)
        if template_file is not None:
            template_file.enforce_file()
            template = template_file
        cmd.append(f"--template={template}")

    if crossref:
        cmd.append("--filter=pandoc-crossref")

    # The csl file is treated in the same way as the template.
    csl_file: Path | None = None
    if bibliography:
        cmd.append("--citeproc")
        if csl is not None:
            csl = enforce_non_empty_str_without_ws(csl)
            csl_file = resolve_resources(csl, input_dir)
            if csl_file is not None:
                csl_file.enforce_file()
                csl = csl_file
            cmd.append(f"--csl={csl}")

    if args is not None:
        cmd.extend([enforce_non_empty_str(a).strip()
                    for a in args])
    cmd.append(input_file)

    if locale is not None:
        locale = enforce_non_empty_str_without_ws(locale)
        # NOTE(review): option and value form ONE list element here. If
        # `shell` passes list elements as separate argv entries, pandoc
        # may see the variable name with a leading space -- confirm and
        # consider "--variable=lang=..." instead.
        cmd.append(f"-V lang={locale.replace('_', '-')}")

    try:
        shell(cmd, timeout=600, cwd=input_dir, exit_code_to_str=__EXIT_CODES,
              check_stderr=__pandoc_check_stderr)
    finally:
        # Resolved resources are removed regardless of whether pandoc
        # succeeded -- presumably the resolver stores private copies of
        # them; verify against the resolver implementation.
        if template_file:
            os.remove(template_file)
        if csl_file:
            os.remove(csl_file)

    res = File(output_file)

    logger(f"finished applying pandoc, got output file "
           f"'{res.path}' of size {res.size} bytes.")
    return res
def latex(source_file: str,
          dest_file: str,
          format_in: str = bc.PANDOC_FORMAT_MARKDOWN,
          locale: str | None = None,
          standalone: bool = True,
          tabstops: int | None = 2,
          toc_print: bool = True,
          toc_depth: int = 3,
          crossref: bool = True,
          bibliography: bool = True,
          number_sections: bool = True,
          top_level_division: str = "chapter",
          use_listings: bool = False,
          get_meta: Callable = lambda x: None,
          resolve_resources: Callable = lambda *_: None) -> File:
    """
    Invoke pandoc to build LaTeX and then PDF output.

    Pandoc writes into a temporary file with suffix `.pdf`, which is then
    post-processed into `dest_file`.

    :param source_file: the source file
    :param dest_file: the destination file
    :param format_in: the input format
    :param locale: the language to be used for compiling; for Chinese
        locales (`zh`, `zh-...`, `zh_...`), xelatex is used as pdf engine
    :param standalone: should we produce a stand-alone document?
    :param tabstops: the number of spaces with which we replace
        a tab character, or None to not replace
    :param toc_print: should we print the table of contents
    :param toc_depth: the depth of the table of contents
    :param crossref: should we use crossref
    :param bibliography: should we use a bibliography
    :param number_sections: should sections be numbered?
    :param top_level_division: the top-level division
    :param use_listings: should the listings package be used?
    :param get_meta: a function to access meta-data; it is called with
        the key of the LaTeX template and with the key of the csl file
    :param resolve_resources: a function to resolve resources; it is
        forwarded to :func:`pandoc`, which calls it with two arguments
        (resource name and input directory). The default resolves
        nothing.
    :return: the Path to the generated output file and its size
    """
    args = []
    if locale is not None:
        locale = enforce_non_empty_str_without_ws(locale)
        if locale == "zh" or locale.startswith(("zh-", "zh_")):
            args.append("--pdf-engine=xelatex")
    top_level_division = enforce_non_empty_str_without_ws(top_level_division)
    args.append(f"--top-level-division={top_level_division}")
    if use_listings:
        args.append("--listings")

    with TempFile.create(suffix=".pdf") as tf:
        raw_pdf = pandoc(source_file=source_file,
                         dest_file=tf,
                         format_in=format_in,
                         format_out=bc.PANDOC_FORMAT_LATEX,
                         standalone=standalone,
                         tabstops=tabstops,
                         toc_print=toc_print,
                         toc_depth=toc_depth,
                         crossref=crossref,
                         bibliography=bibliography,
                         template=get_meta(bc.PANDOC_TEMPLATE_LATEX),
                         csl=get_meta(bc.PANDOC_CSL),
                         number_sections=number_sections,
                         locale=locale,
                         resolve_resources=resolve_resources,
                         args=args,
                         overwrite=True).path
        # post-process while the temporary file still exists
        res = pdf_postprocess(in_file=raw_pdf,
                              out_file=dest_file)
    return File(res)
def html(source_file: str,
         dest_file: str,
         format_in: str = bc.PANDOC_FORMAT_MARKDOWN,
         locale: str | None = None,
         standalone: bool = True,
         tabstops: int | None = 2,
         toc_print: bool = True,
         toc_depth: int = 3,
         crossref: bool = True,
         bibliography: bool = True,
         number_sections: bool = True,
         get_meta: Callable = lambda x: None,
         resolve_resources: Callable = lambda *_: None) -> File:
    """
    Invoke pandoc to build HTML output.

    Pandoc writes into a temporary `.html` file while a
    :class:`ResourceServer` provides the MathJax url. If a bibliography
    is used, the markup of the references section is adjusted. Finally,
    the temporary file is post-processed into `dest_file`.

    :param source_file: the source file
    :param dest_file: the destination file
    :param format_in: the input format
    :param locale: the language to be used for compiling
    :param standalone: should we produce a stand-alone document?
    :param tabstops: the number of spaces with which we replace
        a tab character, or None to not replace
    :param toc_print: should we print the table of contents
    :param toc_depth: the depth of the table of contents
    :param crossref: should we use crossref
    :param bibliography: should we use a bibliography
    :param number_sections: should sections be numbered?
    :param get_meta: a function to access meta-data; it is called with
        the key of the HTML template and with the key of the csl file
    :param resolve_resources: a function to resolve resources; it is
        forwarded to :func:`pandoc`, which calls it with two arguments
        (resource name and input directory). The default resolves
        nothing.
    :return: the Path to the generated output file and its size
    :raises ValueError: if anything goes wrong; every exception that is
        not a `ValueError` is wrapped into one
    """
    endresult: Path | None = None  # nosem  # type: ignore  # nolint
    try:
        with TempFile.create(suffix=".html") as tmp:
            # noinspection PyUnusedLocal
            inner_file: Path | None = None  # nosem  # type: ignore
            try:
                with ResourceServer() as serv:
                    inner_file = pandoc(
                        source_file=source_file,
                        dest_file=tmp,
                        format_in=format_in,
                        format_out=bc.PANDOC_FORMAT_HTML5,
                        locale=locale,
                        standalone=standalone,
                        tabstops=tabstops,
                        toc_print=toc_print,
                        toc_depth=toc_depth,
                        crossref=crossref,
                        bibliography=bibliography,
                        template=get_meta(bc.PANDOC_TEMPLATE_HTML5),
                        csl=get_meta(bc.PANDOC_CSL),
                        number_sections=number_sections,
                        resolve_resources=resolve_resources,
                        overwrite=True,
                        args=[f"--mathjax={serv.get_mathjax_url()}",
                              "--ascii", "--html-q-tags",
                              # eventually replace --self-contained with
                              # "--embed-resources"
                              "--self-contained"]).path
                    if inner_file is not None:
                        inner_file.enforce_file()
            except ValueError:
                raise
            except BaseException as ve:
                # NOTE(review): this also wraps KeyboardInterrupt and
                # SystemExit into ValueError -- kept as in the original.
                raise ValueError from ve

            if inner_file is None:
                raise ValueError("Huh? pandoc did not return a file?")  # noqa

            if bibliography:
                # For some reason, the id and the text of each bibliography
                # item are each put into separate divs of classes for which
                # no styles are given. Therefore, we convert these divs to
                # spans and add some vertical spacing.
                text = enforce_non_empty_str(
                    inner_file.read_all_str().strip())
                # Only the part starting at the last references div is
                # modified, the text before it is written back unchanged.
                end = text.rfind('<div id="refs"')
                if end > 0:
                    text_1 = text[:end]
                    text_2 = text[end:]
                    del text

                    text_2 = regex_sub(
                        '\\s*<div\\s+class="\\s*csl-left-margin\\s*"\\s*>'
                        '\\s*(.*?)\\s*</div>\\s*',
                        '<span class="csl-left-margin">\\1</span> ',
                        text_2)

                    text_2 = regex_sub(
                        '\\s*<div\\s+class="\\s*csl-right-inline\\s*"\\s*>'
                        '\\s*(.*?)\\s*</div>\\s*',
                        '<span class="csl-right-inline">\\1</span>',
                        text_2)

                    text_2 = text_2.replace(
                        ' class="csl-entry" role="doc-biblioentry">',
                        ' class="csl-entry" role="doc-biblioentry" '
                        'style="margin-top:0.33em">')

                    inner_file.write_all([text_1, text_2])

            # post-process while the temporary file still exists
            endresult = html_postprocess(in_file=inner_file,
                                         out_file=dest_file,
                                         flatten_data_uris=True,
                                         fully_evaluate_html=True,
                                         purge_scripts=True,
                                         minify=True,
                                         purge_mathjax=True,
                                         canonicalize_ids=True,
                                         overwrite=False)
    except ValueError:
        raise
    except BaseException as be:
        raise ValueError from be

    if endresult is None:
        raise ValueError("end result is still None?")
    return File(endresult)
def epub(source_file: str,
         dest_file: str,
         format_in: str = bc.PANDOC_FORMAT_MARKDOWN,
         locale: str | None = None,
         standalone: bool = True,
         tabstops: int | None = 2,
         toc_print: bool = True,
         toc_depth: int = 3,
         crossref: bool = True,
         bibliography: bool = True,
         number_sections: bool = True,
         get_meta: Callable = lambda x: None,
         resolve_resources: Callable = lambda *_: None) -> File:
    """
    Invoke pandoc to build epub output.

    This is a thin wrapper around :func:`pandoc` which fixes the output
    format and adds the arguments `--mathml`, `--ascii`, `--html-q-tags`,
    and `--self-contained`. Since `overwrite` is not passed on, an
    existing `dest_file` makes :func:`pandoc` raise a `ValueError`.

    :param source_file: the source file
    :param dest_file: the destination file
    :param format_in: the input format
    :param locale: the language to be used for compiling
    :param standalone: should we produce a stand-alone document?
    :param tabstops: the number of spaces with which we replace
        a tab character, or None to not replace
    :param toc_print: should we print the table of contents
    :param toc_depth: the depth of the table of contents
    :param crossref: should we use crossref
    :param bibliography: should we use a bibliography
    :param number_sections: should sections be numbered?
    :param get_meta: a function to access meta-data; it is called with
        the key of the epub template and with the key of the csl file
    :param resolve_resources: a function to resolve resources; it is
        forwarded to :func:`pandoc`, which calls it with two arguments
        (resource name and input directory). The default resolves
        nothing.
    :return: the Path to the generated output file and its size
    """
    return pandoc(source_file=source_file,
                  dest_file=dest_file,
                  format_in=format_in,
                  format_out=bc.PANDOC_FORMAT_EPUB,
                  locale=locale,
                  standalone=standalone,
                  tabstops=tabstops,
                  toc_print=toc_print,
                  toc_depth=toc_depth,
                  crossref=crossref,
                  bibliography=bibliography,
                  template=get_meta(bc.PANDOC_TEMPLATE_EPUB),
                  csl=get_meta(bc.PANDOC_CSL),
                  number_sections=number_sections,
                  resolve_resources=resolve_resources,
                  args=["--mathml", "--ascii", "--html-q-tags",
                        "--self-contained"])
def azw3(epub_file: str) -> File:
    """
    Convert an epub book into an azw3 one.

    The `ebook-convert` command is executed in the directory of the epub
    file. The result is stored in that same directory, under the name of
    the epub file with the suffix replaced by `azw3`.

    :param epub_file: the epub file
    :return: the azw3 file
    """
    source = Path.file(epub_file)
    folder = Path.directory(os.path.dirname(source))
    # only the name prefix is needed, the old suffix is dropped
    stem, _ = Path.split_prefix_suffix(os.path.basename(source))
    target = Path.resolve_inside(folder, f"{stem}.azw3")
    shell(["ebook-convert", source, target, "--embed-all-fonts"],
          timeout=360, cwd=folder)

    return File(target)