Coverage for bookbuilderpy/strings.py: 62%

1"""Some utility methods for string processing."""

2import datetime

3import re

4import string

5from typing import (

6 Callable,

7 Final,

8 Iterable,

9 Pattern,

10)

11from urllib.parse import urlparse

13from bookbuilderpy.types import type_error

def str_to_lines(text: str) -> list[str]:
    r"""
    Split a text into its lines at every newline character.

    Only ``"\n"`` is used as separator. All other white space is kept, and
    leading, trailing, or repeated newlines yield empty strings.

    :param text: the original text string
    :return: the list of lines
    :raises TypeError: if `text` is not a `str`

    >>> str_to_lines("\n123\n 456\n789 \n 10\n\n")
    ['', '123', ' 456', '789 ', ' 10', '', '']
    """
    if isinstance(text, str):
        return text.split("\n")
    raise type_error(text, "text", str)

def lines_to_str(lines: Iterable[str],
                 trailing_newline: bool = True) -> str:
    r"""
    Join several lines into one newline-separated string.

    All white space at the end of the joined text is stripped first; after
    that, a single newline is appended if `trailing_newline` is set.

    :param lines: the lines
    :param trailing_newline: should there be a newline at the end?
    :return: the single string
    :raises TypeError: if `lines` is not an `Iterable`

    >>> lines_to_str(["a", "b", "", "c", ""], trailing_newline=True)
    'a\nb\n\nc\n'
    >>> lines_to_str(["a", "b", "", "c"], trailing_newline=True)
    'a\nb\n\nc\n'
    >>> lines_to_str(["a", "b", "", "c"], trailing_newline=False)
    'a\nb\n\nc'
    >>> lines_to_str(["a", "b", "", "c", ""], trailing_newline=False)
    'a\nb\n\nc'
    """
    if isinstance(lines, Iterable):
        body = "\n".join(lines).rstrip()
        return f"{body}\n" if trailing_newline else body
    raise type_error(lines, "lines", Iterable)

def enforce_non_empty_str(text: str) -> str:
    """
    Check that a value is a string containing at least one character.

    :param text: the text
    :returns: the unchanged text
    :raises TypeError: if `text` is not a `str`
    :raises ValueError: if `text` is empty
    """
    if not isinstance(text, str):
        raise type_error(text, "text", str)
    if text:
        return text
    raise ValueError(f"Non-empty str expected, but got '{text}'.")

def enforce_non_empty_str_without_ws(text: str) -> str:
    """
    Check that a value is a non-empty string free of white space.

    The characters counted as white space are those in `string.whitespace`.

    :param text: the text
    :returns: the unchanged text
    :raises TypeError: if `text` is not a `str`
    :raises ValueError: if `text` is empty or contains any white space
        characters
    """
    text = enforce_non_empty_str(text)
    for ws_char in string.whitespace:
        if ws_char in text:
            raise ValueError(
                f"No white space allowed in string, but got '{text}'.")
    return text

def datetime_to_date_str(date: datetime.datetime) -> str:
    """
    Render the date part of a datetime as year, month, and day.

    The three fields are separated by the non-breaking hyphen U+2011
    rather than by the ASCII minus sign.

    :param date: the date
    :return: the date string
    :raises TypeError: if `date` is not a `datetime.datetime`
    """
    if isinstance(date, datetime.datetime):
        return date.strftime("\u2011".join(("%Y", "%m", "%d")))
    raise type_error(date, "date", datetime.datetime)

101

102

def datetime_to_datetime_str(date: datetime.datetime) -> str:
    """
    Render a datetime as date, time of day, and time zone name.

    The date fields are separated by the non-breaking hyphen U+2011. The
    date, the ``hour:minute`` time, and the time zone name are separated by
    the non-breaking space U+00A0.

    :param date: the date
    :return: the date-time string
    :raises TypeError: if `date` is not a `datetime.datetime`
    """
    if isinstance(date, datetime.datetime):
        day_format = "\u2011".join(("%Y", "%m", "%d"))
        return date.strftime("\u00a0".join((day_format, "%H:%M", "%Z")))
    raise type_error(date, "date", datetime.datetime)

113

114

def enforce_url(url: str) -> str:
    """
    Check that a string looks like an acceptable url and normalize it.

    The url must be a non-empty string without white space and without
    ``..``. Its scheme must be ``ssh``, ``http``, or ``https``; only ``ssh``
    urls may contain ``@``. Both the network location and the path must be
    non-empty.

    :param url: the url
    :return: the url as re-assembled by `urlparse(url).geturl()`
    :raises TypeError: if `url` is not a `str`
    :raises ValueError: if any of the checks above fails
    """
    enforce_non_empty_str_without_ws(url)
    if ".." in url:
        raise ValueError(f"Invalid url '{url}', contains '..'.")

    parsed = urlparse(url)
    scheme = parsed.scheme
    is_ssh = scheme == "ssh"
    if (not is_ssh) and (scheme not in ("http", "https")):
        raise ValueError(f"Invalid scheme '{scheme}' in url '{url}'.")
    if (not is_ssh) and ("@" in url):
        raise ValueError(
            f"Non-ssh URL must not contain '@', but '{url}' does")

    # Both components must be present and free of white space.
    for component in (parsed.netloc, parsed.path):
        enforce_non_empty_str_without_ws(component)
    return parsed.geturl()

135

136

137def get_prefix_str(str_list: tuple[str, ...] | list[str]) -> str:

138 r"""

139 Compute the common prefix string.

140

141 :param str_list: the list of strings

142 :return: the common prefix

143

144 >>> get_prefix_str(["abc", "acd"])

145 'a'

146 >>> get_prefix_str(["xyz", "gsdf"])

147 ''

148 >>> get_prefix_str([])

149 ''

150 >>> get_prefix_str(["abx"])

151 'abx'

152 >>> get_prefix_str(("\\relative.path", "\\relative.figure",

153 ... "\\relative.code"))

154 '\\relative.'

155 """

156 if len(str_list) <= 0:

157 return ""

158 prefix_str = ""

159 len_smallest_str = min([len(str_mem) for str_mem in str_list])

160 str_list_0 = str_list[0]

161 for i in range(len_smallest_str):

162 f = str_list_0[i]

163 if len([0 for ind in range(1, len(str_list))

164 if f != str_list[ind][i]]) > 0:

165 break

166 prefix_str += f

167 return prefix_str

168

169

#: The language to locale dictionary for base locales: it maps a short
#: language id to the locale string that `lang_to_locale` returns for it.
#: Both "zh" and "cn" map to the same locale, "zh_CN".
__LANG_DICT: Final[dict[str, str]] = {
    "en": "en_US",
    "zh": "zh_CN",
    "cn": "zh_CN",
    "tw": "zh_TW",
    "de": "de_DE",
    "fr": "fr_FR",
    "it": "it_IT",
    "ja": "ja_JP",
    "ko": "ko_KR",
    "pt": "pt_BR",
    "es": "es_ES",
}

184

185

def lang_to_locale(lang: str) -> str:
    """
    Translate a language id into a locale string.

    Ids listed in the module's language table are mapped to their base
    locale. Any other id is returned with every ``-`` replaced by ``_``.

    :param lang: the language id
    :return: the locale
    :raises TypeError: if `lang` is not a `str`
    :raises ValueError: if `lang` is empty or contains white space
    """
    lang = enforce_non_empty_str_without_ws(lang)
    known_locale = __LANG_DICT.get(lang)
    if known_locale is not None:
        return known_locale
    # A string without "-" is returned unchanged by replace().
    return lang.replace("-", "_")

199

200

def file_size(size: int) -> str:
    """
    Convert a file size in bytes to a string with a binary unit suffix.

    The size is expressed in the smallest unit (B, KiB, MiB, ..., YiB) for
    which the value, rounded up to the next whole number, is below 1024.

    :param size: the size in bytes, a non-negative integer
    :return: the string, e.g. ``"0 B"``, ``"1 KiB"`` or ``"2 MiB"``
    :raises ValueError: if `size` is not a non-negative `int` or is too
        large to be expressed in YiB
    """
    if (not isinstance(size, int)) or (size < 0):
        raise ValueError(f"Invalid size: {size}.")

    base_size: int = 1
    for suffix in ("B", "KiB", "MiB", "GiB", "TiB", "PiB",
                   "EiB", "ZiB", "YiB"):
        # Exact integer ceiling division. Going through float division
        # loses precision for large sizes and may round one unit too high.
        ret_size = (size + base_size - 1) // base_size
        if ret_size < 1024:
            return f"{ret_size} {suffix}"
        base_size *= 1024
    raise ValueError(f"Invalid size: {size}.")

220

221

222#: The dictionary with "and" concatenations

223__AND_DICT: dict[str, tuple[str, str]] = {

224 "de": (" und ", " und "),

225 "en": (" and ", ", and "),

226}

227

228

229def to_string(obj,

230 locale: str | None = None,

231 use_seq_and: bool = True) -> str:

232 """

233 Convert any object to a string, try to use a proper locale.

234

235 :param obj: the input object

236 :param locale: the locale

237 :param use_seq_and: should we use "and" in sequences?

238 :return: the string representation

239 """

240 if obj is None:

241 return "None"

242

243 if isinstance(obj, str):

244 return obj.strip()

245

246 if isinstance(obj, Iterable):

247 merge = ", "

248 if (locale is not None) and (locale.startswith("zh")):

249 merge = ","

250

251 seq = [to_string(r, locale, use_seq_and).strip() for r in obj]

252 seql = len(seq)

253 if seql == 1:

254 return seq[0]

255

256 if use_seq_and and (locale is not None):

257 ands = __AND_DICT.get(locale, None)

258 if not ands:

259 ands = __AND_DICT.get(locale.split("_")[0], None)

260 if ands:

261 if seql == 2:

262 return ands[0].join(seq)

263 res = merge.join(seq[:-1])

264 return ands[1].join([res, seq[-1]])

265

266 return merge.join(seq)

267

268 return str(obj).strip()

269

270

271def regex_sub(search: str | Pattern,

272 replace: Callable | str,

273 inside: str) -> str:

274 r"""

275 Replace all occurrences of 'search' in 'inside' with 'replace'.

276

277 :param search: the regular expression to search

278 :param replace: the regular expression to replace it with

279 :param inside: the string in which to search/replace

280 :return: the new string after the recursive replacement

281

282 >>> regex_sub('[ \t]+\n', '\n', ' bla \nxyz\tabc\t\n')

283 ' bla\nxyz\tabc\n'

284 >>> regex_sub('[0-9]A', 'X', '23A7AA')

285 '2XXA'

286 """

287 while True:

288 text = re.sub(search, replace, inside, re.MULTILINE)

289 if text is inside:

290 return inside

291 inside = text