# Source code for pycommons.dev.doc.process_md

"""
Process a markdown file in order to make it useful for distribution.

In order to let sphinx properly load and insert the `README.md` file into the
project's documentation, we need to process this file from the GitHub style
markdown to a variant suitable for the myst parser used in sphinx. While we
are at it, we can also turn absolute URLs from the GitHub-`README.md` file
that point to the documentation URL to relative URLs.
"""

from re import Pattern, sub
from re import compile as re_compile
from typing import Any, Callable, Final, Iterable, Mapping, cast

from pycommons.dev.url_replacer import make_url_replacer
from pycommons.types import type_error

#: Matches in-page markdown links whose anchor starts with a numeric section
#: prefix, i.e., strings of the form ``[xyz](#123-bla)``. Group ``\1`` holds
#: the link text *including* its square brackets (``[xyz]``) and group ``\2``
#: holds the anchor without the leading digits and hyphen (``bla``).
__FIX_LINKS: Final[Pattern] = re_compile(r"(\[.+?])\(#\d+-(.+?)\)")


def __process_markdown(
        source: Iterable[str], dest: Callable[[str], Any],
        line_processor: Callable[[str], str] = lambda s: s,
        discard_until: str | None = "## 1. Introduction") -> None:
    """
    Process a markdown file in order to make it useful for distribution.

    This process changes the GitHub-style markdown to a format that the myst
    parser, which is used by sphinx, can render properly. This involves
    several issues:

    1. We discard the top-level heading.
    2. We need to move all sub-headings one step up.
    3. Furthermore, we can turn all absolute URLs pointing to the
       documentation website to local references starting with `./`.

    :param source: the source line iterable
    :param dest: the destination callable receiving the output
    :param line_processor: an optional callable for processing lines
    :param discard_until: discard all strings until reaching this line. If
        this is `None`, all lines will be used. If this is not `None`, then
        this will be the first line to be forwarded to `dest`f

    >>> lp = list()
    >>> lpp = make_url_replacer({"https://example.com/": "./"},
    ...                         {"https://example.com/A": "xyz"})
    >>> src = ["![image](https://example.com/1.jp)",
    ...        "# This is `pycommons!`",
    ...        "Table of contents",
    ...        "## 1. Introduction",
    ...        "blabla bla <https://example.com/A>!",
    ...        "## 2. Some More Text",
    ...        "We [also say](https://example.com/z/hello.txt) stuff.",
    ...        "### 2.4. Code Example",
    ...        "```",
    ...        "But [not in code](https://example.com/z/hello.txt).",
    ...        "```",
    ...        "See also [here](#24-code-example)."]
    >>> __process_markdown(src, print, lpp)
    # 1. Introduction
    blabla bla <xyz>!
    # 2. Some More Text
    We [also say](./z/hello.txt) stuff.
    ## 2.4. Code Example
    ```
    But [not in code](https://example.com/z/hello.txt).
    ```
    See also [here](#24-code-example).

    >>> try:
    ...     __process_markdown(None, print, lambda x: x, "bla")
    ... except TypeError as te:
    ...     print(te)
    source should be an instance of typing.Iterable but is None.

    >>> try:
    ...     __process_markdown(1, print, lambda x: x, "bla")
    ... except TypeError as te:
    ...     print(te)
    source should be an instance of typing.Iterable but is int, namely 1.

    >>> try:
    ...     __process_markdown([None], print, lambda x: x, "bla")
    ... except TypeError as te:
    ...     print(te)
    descriptor 'rstrip' for 'str' objects doesn't apply to a 'NoneType' object

    >>> try:
    ...     __process_markdown([1], print, lambda x: x, "bla")
    ... except TypeError as te:
    ...     print(te)
    descriptor 'rstrip' for 'str' objects doesn't apply to a 'int' object

    >>> try:
    ...     __process_markdown([""], None, lambda x: x, "bla")
    ... except TypeError as te:
    ...     print(te)
    dest should be a callable but is None.

    >>> try:
    ...     __process_markdown([""], 1, lambda x: x, "bla")
    ... except TypeError as te:
    ...     print(te)
    dest should be a callable but is int, namely 1.

    >>> try:
    ...     __process_markdown([""], print, None, "bla")
    ... except TypeError as te:
    ...     print(te)
    line_processor should be a callable but is None.

    >>> try:
    ...     __process_markdown([""], print, 1, "bla")
    ... except TypeError as te:
    ...     print(te)
    line_processor should be a callable but is int, namely 1.

    >>> try:
    ...     __process_markdown([""], print, lambda x: x, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __process_markdown([""], print, lambda x: x, "")
    ... except ValueError as ve:
    ...     print(ve)
    discard_until cannot be ''.

    >>> __process_markdown([""], print, lambda x: x, None)
    <BLANKLINE>
    """
    if not isinstance(source, Iterable):
        raise type_error(source, "source", Iterable)
    if not callable(dest):
        raise type_error(dest, "dest", call=True)
    if not callable(line_processor):
        raise type_error(line_processor, "line_processor", call=True)

    skip: bool = False
    if discard_until is not None:
        if str.__len__(discard_until) <= 0:
            raise ValueError(f"discard_until cannot be {discard_until!r}.")
        skip = True
    else:
        discard_until = ""

    in_code: bool = False  # we only process non-code lines
    needs_newline: bool = False  # required after image lines
    add_images_anyway: bool = True
    for the_line in source:
        line = str.rstrip(the_line)  # enforce string

        # we skip everything until the introduction section
        if skip:
            the_line_lstr = str.lstrip(the_line)
            if str.__len__(the_line_lstr) <= 0:
                continue
            if the_line_lstr.startswith(discard_until):
                skip = False
            elif the_line_lstr.startswith("[![") and add_images_anyway:
                needs_newline = True
                dest(line)
                continue
            else:
                add_images_anyway = False
                continue

        if needs_newline:
            dest("")
            needs_newline = False

        if in_code:
            if line.startswith("```"):
                in_code = False  # toggle to non-code
        elif line.startswith("```"):
            in_code = True  # toggle to code
        elif line.startswith("#"):
            line = line[1:]  # move all sub-headings one step up
        else:  # e.g., fix all urls via the line processor
            line = str.rstrip(line_processor(line))

        dest(line)



# [docs]
def process_markdown_for_sphinx(
        source: Iterable[str], dest: Callable[[str], Any],
        base_urls: Mapping[str, str] | None = None,
        full_urls: Mapping[str, str] | None = None,
        discard_until: str | None = "## 1. Introduction") -> None:
    """
    Convert a GitHub-style markdown file into a variant usable by sphinx.

    The myst parser used by sphinx cannot properly render everything that
    works in a GitHub `README.md` file. The lines from `source` are therefore
    converted as follows before they are passed to `dest`:

    1. The top-level heading is discarded.
    2. All sub-headings are moved one step up.
    3. Absolute URLs pointing to the documentation website can be turned
       into local references starting with `./`, based on the mappings
       `base_urls` and `full_urls`.
    4. The myst parser drops the numerical prefixes of links, i.e., it tags
       `## 1.2. Hello` with id `hello` instead of `12-hello`. All references
       following the pattern `[xxx](#12-hello)` are therefore rewritten to
       `[xxx](#hello)`.

    :param source: the iterable providing the source lines
    :param dest: the callable receiving each output line
    :param base_urls: a mapping of basic urls to shortcuts
    :param full_urls: a mapping of full urls to abbreviations
    :param discard_until: discard all lines until a line starting with this
        string is reached. If this is `None`, all lines will be used. If this
        is not `None`, then the matching line will be the first regular line
        to be forwarded to `dest`

    >>> lp = list()
    >>> src = ["![image](https://example.com/1.jp)",
    ...        "# This is `pycommons!`",
    ...        "Table of contents",
    ...        "## 1. Introduction",
    ...        "blabla bla <https://example.com/A>!",
    ...        "## 2. Some More Text",
    ...        "We [also say](https://example.com/z/hello.txt) stuff.",
    ...        "### 2.4. Code Example",
    ...        "```",
    ...        "But [not in code](https://example.com/z/hello.txt).",
    ...        "```",
    ...        "See also [here](#24-code-example)."]
    >>> process_markdown_for_sphinx(src, print,
    ...     {"https://example.com/": "./"},
    ...     {"https://example.com/A": "xyz"})
    # 1. Introduction
    blabla bla <xyz>!
    # 2. Some More Text
    We [also say](./z/hello.txt) stuff.
    ## 2.4. Code Example
    ```
    But [not in code](https://example.com/z/hello.txt).
    ```
    See also [here](#code-example).

    >>> try:
    ...     process_markdown_for_sphinx(None, print)
    ... except TypeError as te:
    ...     print(te)
    source should be an instance of typing.Iterable but is None.

    >>> try:
    ...     process_markdown_for_sphinx(1, print)
    ... except TypeError as te:
    ...     print(te)
    source should be an instance of typing.Iterable but is int, namely 1.

    >>> try:
    ...     process_markdown_for_sphinx([None], print)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'rstrip' for 'str' objects doesn't apply to a 'NoneType' object

    >>> try:
    ...     process_markdown_for_sphinx([1], print)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'rstrip' for 'str' objects doesn't apply to a 'int' object

    >>> try:
    ...     process_markdown_for_sphinx([""], None)
    ... except TypeError as te:
    ...     print(te)
    dest should be a callable but is None.

    >>> try:
    ...     process_markdown_for_sphinx([""], 1)
    ... except TypeError as te:
    ...     print(te)
    dest should be a callable but is int, namely 1.

    >>> try:
    ...     process_markdown_for_sphinx([""], print, 1, None, "bla")
    ... except TypeError as te:
    ...     print(te)
    base_urls should be an instance of typing.Mapping but is int, namely 1.

    >>> try:
    ...     process_markdown_for_sphinx([""], print, None, 1, "bla")
    ... except TypeError as te:
    ...     print(te)
    full_urls should be an instance of typing.Mapping but is int, namely 1.

    >>> try:
    ...     process_markdown_for_sphinx([""], print, None, None, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     process_markdown_for_sphinx([""], print, None, None, "")
    ... except ValueError as ve:
    ...     print(ve)
    discard_until cannot be ''.

    >>> process_markdown_for_sphinx([""], print, None, None, None)
    <BLANKLINE>
    """
    # The url replacer is created exactly once and before any line is read.
    replace_urls = make_url_replacer(base_urls, full_urls)

    # Per line: first strip the numerical prefix from in-page anchors, then
    # apply the url replacements to the result.
    __process_markdown(
        source, dest,
        cast("Callable[[str], str]",
             lambda text: replace_urls(
                 sub(__FIX_LINKS, "\\1(#\\2)", text))),
        discard_until)