Source code for bookbuilderpy.strings

"""Some utility methods for string processing."""
import datetime
import re
import string
from typing import (
    Callable,
    Final,
    Iterable,
    Pattern,
)
from urllib.parse import urlparse

from bookbuilderpy.types import type_error


def str_to_lines(text: str) -> list[str]:
    r"""
    Split a text into its lines at every newline character.

    The text is cut at each ``"\n"`` only. No white space is stripped, so
    leading or trailing newlines show up as empty strings in the result.

    :param text: the text to split
    :return: the list of lines, without the newline characters

    >>> str_to_lines("\n123\n  456\n789 \n 10\n\n")
    ['', '123', '  456', '789 ', ' 10', '', '']
    """
    if isinstance(text, str):
        return text.split("\n")
    # anything that is not a string is rejected via the shared helper
    raise type_error(text, "text", str)


def lines_to_str(lines: Iterable[str],
                 trailing_newline: bool = True) -> str:
    r"""
    Join several lines into one string, separated by newlines.

    All white space at the end of the joined text is removed first, so
    trailing empty lines are dropped. Afterwards a single newline is
    appended if requested.

    :param lines: the lines to join
    :param trailing_newline: should there be a newline at the end?
    :return: the joined string

    >>> lines_to_str(["a", "b", "", "c", ""], trailing_newline=True)
    'a\nb\n\nc\n'
    >>> lines_to_str(["a", "b", "", "c"], trailing_newline=True)
    'a\nb\n\nc\n'
    >>> lines_to_str(["a", "b", "", "c"], trailing_newline=False)
    'a\nb\n\nc'
    >>> lines_to_str(["a", "b", "", "c", ""], trailing_newline=False)
    'a\nb\n\nc'
    """
    if not isinstance(lines, Iterable):
        raise type_error(lines, "lines", Iterable)

    joined = "\n".join(lines)
    stripped = joined.rstrip()  # drops trailing blank lines and spaces
    return f"{stripped}\n" if trailing_newline else stripped


def enforce_non_empty_str(text: str) -> str:
    """
    Check that a value is a string with at least one character.

    :param text: the value to check
    :returns: `text`, unchanged, if it passed the check
    :raises TypeError: if `text` is not a `str`
    :raises ValueError: if `text` is the empty string
    """
    if isinstance(text, str):
        if len(text) > 0:
            return text
        raise ValueError(f"Non-empty str expected, but got '{text}'.")
    raise type_error(text, "text", str)


def enforce_non_empty_str_without_ws(text: str) -> str:
    """
    Check that a value is a non-empty string free of white space.

    White space here means the characters listed in `string.whitespace`.

    :param text: the value to check
    :returns: `text`, unchanged, if it passed the check
    :raises TypeError: if `text` is not a `str`
    :raises ValueError: if `text` is empty or contains any white space
        characters
    """
    text = enforce_non_empty_str(text)
    for white_space in string.whitespace:
        if white_space in text:
            raise ValueError(
                f"No white space allowed in string, but got '{text}'.")
    return text


def datetime_to_date_str(date: datetime.datetime) -> str:
    """
    Format the date part of a datetime object as `year-month-day`.

    The three fields are separated by the character U+2011 instead of
    the plain ASCII hyphen.

    :param date: the datetime object to format
    :return: the date string
    """
    if isinstance(date, datetime.datetime):
        return date.strftime("%Y\u2011%m\u2011%d")
    raise type_error(date, "date", datetime.datetime)


def datetime_to_datetime_str(date: datetime.datetime) -> str:
    """
    Format a datetime object as `year-month-day hour:minute zone`.

    The date fields are separated by U+2011 and the three parts (date,
    time, time zone name) by U+00A0 instead of plain hyphens and spaces.

    :param date: the datetime object to format
    :return: the date-time string
    """
    if isinstance(date, datetime.datetime):
        return date.strftime("%Y\u2011%m\u2011%d\u00a0%H:%M\u00a0%Z")
    raise type_error(date, "date", datetime.datetime)


def enforce_url(url: str) -> str:
    """
    Check that a string is an acceptable url and return it normalized.

    The url must be a non-empty string without white space and without
    the sequence `..`. Its scheme must be `ssh`, `http`, or `https`, and
    only `ssh` urls may contain an `@`. Both the network location and the
    path of the url must be non-empty.

    :param url: the url to check
    :return: the url as re-assembled by `urlparse(url).geturl()`
    :raises ValueError: if the url violates any of the rules above
    """
    enforce_non_empty_str_without_ws(url)
    if ".." in url:
        raise ValueError(f"Invalid url '{url}', contains '..'.")

    parsed = urlparse(url)
    is_ssh = parsed.scheme == "ssh"
    if (not is_ssh) and (parsed.scheme not in ("http", "https")):
        raise ValueError(f"Invalid scheme '{parsed.scheme}' in url '{url}'.")
    if (not is_ssh) and ("@" in url):
        raise ValueError(
            f"Non-ssh URL must not contain '@', but '{url}' does")

    # both the host part and the path part are mandatory
    for component in (parsed.netloc, parsed.path):
        enforce_non_empty_str_without_ws(component)
    return parsed.geturl()


[docs]def get_prefix_str(str_list: tuple[str, ...] | list[str]) -> str:
    r"""
    Compute the common prefix string.

    :param str_list: the list of strings
    :return: the common prefix

    >>> get_prefix_str(["abc", "acd"])
    'a'
    >>> get_prefix_str(["xyz", "gsdf"])
    ''
    >>> get_prefix_str([])
    ''
    >>> get_prefix_str(["abx"])
    'abx'
    >>> get_prefix_str(("\\relative.path", "\\relative.figure",
    ...     "\\relative.code"))
    '\\relative.'
    """
    if len(str_list) <= 0:
        return ""
    prefix_str = ""
    len_smallest_str = min([len(str_mem) for str_mem in str_list])
    str_list_0 = str_list[0]
    for i in range(len_smallest_str):
        f = str_list_0[i]
        if len([0 for ind in range(1, len(str_list))
                if f != str_list[ind][i]]) > 0:
            break
        prefix_str += f
    return prefix_str


#: The language to locale dictionary for base locales.
#: Each key is a short language ID and each value the locale that
#: :func:`lang_to_locale` returns for it. Besides language codes, the
#: IDs "cn" and "tw" are accepted and map to the Chinese locales.
__LANG_DICT: Final[dict[str, str]] = {
    "en": "en_US",
    "zh": "zh_CN",
    "cn": "zh_CN",
    "tw": "zh_TW",
    "de": "de_DE",
    "fr": "fr_FR",
    "it": "it_IT",
    "ja": "ja_JP",
    "ko": "ko_KR",
    "pt": "pt_BR",
    "es": "es_ES",
}


def lang_to_locale(lang: str) -> str:
    """
    Translate a language ID into a locale name.

    IDs listed in the language dictionary are mapped to their locale. Any
    other ID is returned with every `-` replaced by `_`, which leaves IDs
    without a `-` unchanged.

    :param lang: the language id, a non-empty string without white space
    :return: the locale
    """
    lang = enforce_non_empty_str_without_ws(lang)
    known_locale = __LANG_DICT.get(lang)
    if known_locale is not None:
        return known_locale
    # e.g. "en-GB" becomes "en_GB"; a no-op if there is no hyphen
    return lang.replace("-", "_")


def file_size(size: int) -> str:
    """
    Convert a file size in bytes to a short human-readable string.

    The size is expressed in the smallest binary unit (B, KiB, MiB, ...)
    for which the number is below 1024. The number is always rounded up
    to the next integer, e.g., 1025 bytes are reported as `2 KiB`.

    :param size: the size in bytes, a non-negative integer
    :return: the string, consisting of the number and the unit
    :raises ValueError: if `size` is not an `int`, is negative, or is too
        large to be expressed with the largest unit (YiB)
    """
    if not (isinstance(size, int) and (size >= 0)):
        raise ValueError(f"Invalid size: {size}.")
    if size == 0:
        return "0 B"

    base_size: int = 1
    for suffix in ("B", "KiB", "MiB", "GiB", "TiB", "PiB",
                   "EiB", "ZiB", "YiB"):
        # Exact integer ceiling division. Float division would lose
        # precision for big sizes and could round, e.g., 2**60 bytes up
        # to "2 EiB" instead of "1 EiB".
        ret_size = (size + base_size - 1) // base_size
        if ret_size < 1024:
            return f"{ret_size} {suffix}"
        base_size *= 1024
    # the size does not fit below 1024 even in the largest unit
    raise ValueError(f"Invalid size: {size}.")


#: The dictionary with "and" concatenations
#: Each key is a language code. Each value is a pair of joiners used by
#: :func:`to_string`: the first joins a sequence of exactly two items,
#: the second is put before the last item of a longer sequence.
__AND_DICT: dict[str, tuple[str, str]] = {
    "de": (" und ", " und "),
    "en": (" and ", ", and "),
}


[docs]def to_string(obj,
              locale: str | None = None,
              use_seq_and: bool = True) -> str:
    """
    Convert any object to a string, try to use a proper locale.

    :param obj: the input object
    :param locale: the locale
    :param use_seq_and: should we use "and" in sequences?
    :return: the string representation
    """
    if obj is None:
        return "None"

    if isinstance(obj, str):
        return obj.strip()

    if isinstance(obj, Iterable):
        merge = ", "
        if (locale is not None) and (locale.startswith("zh")):
            merge = ","

        seq = [to_string(r, locale, use_seq_and).strip() for r in obj]
        seql = len(seq)
        if seql == 1:
            return seq[0]

        if use_seq_and and (locale is not None):
            ands = __AND_DICT.get(locale, None)
            if not ands:
                ands = __AND_DICT.get(locale.split("_")[0], None)
            if ands:
                if seql == 2:
                    return ands[0].join(seq)
                res = merge.join(seq[:-1])
                return ands[1].join([res, seq[-1]])

        return merge.join(seq)

    return str(obj).strip()


[docs]def regex_sub(search: str | Pattern,
              replace: Callable | str,
              inside: str) -> str:
    r"""
    Replace all occurrences of 'search' in 'inside' with 'replace'.

    :param search: the regular expression to search
    :param replace: the regular expression to replace it with
    :param inside: the string in which to search/replace
    :return: the new string after the recursive replacement

    >>> regex_sub('[ \t]+\n', '\n', ' bla \nxyz\tabc\t\n')
    ' bla\nxyz\tabc\n'
    >>> regex_sub('[0-9]A', 'X', '23A7AA')
    '2XXA'
    """
    while True:
        text = re.sub(search, replace, inside, re.MULTILINE)
        if text is inside:
            return inside
        inside = text