Coverage for bookbuilderpy/strings.py: 62%

112 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-17 23:15 +0000

1"""Some utility methods for string processing.""" 

2import datetime 

3import re 

4import string 

5from typing import ( 

6 Callable, 

7 Final, 

8 Iterable, 

9 Pattern, 

10) 

11from urllib.parse import urlparse 

12 

13from bookbuilderpy.types import type_error 

14 

15 

def str_to_lines(text: str) -> list[str]:
    r"""
    Split a text into its lines.

    The text is cut at every ``"\n"``. Nothing is stripped, so leading,
    trailing, and consecutive newline characters yield empty strings.

    :param text: the text to split
    :return: the list of lines, in their original order
    :raises TypeError: if `text` is not a `str`

    >>> str_to_lines("\n123\n 456\n789 \n 10\n\n")
    ['', '123', ' 456', '789 ', ' 10', '', '']
    """
    if isinstance(text, str):
        return text.split("\n")
    raise type_error(text, "text", str)

29 

30 

def lines_to_str(lines: Iterable[str],
                 trailing_newline: bool = True) -> str:
    r"""
    Join several lines into one newline-separated string.

    All white space at the end of the joined text is removed first (this
    includes trailing empty lines); afterwards a single newline is appended
    if requested.

    :param lines: the lines to join
    :param trailing_newline: should there be a newline at the end?
    :return: the joined text
    :raises TypeError: if `lines` is not an `Iterable`

    >>> lines_to_str(["a", "b", "", "c", ""], trailing_newline=True)
    'a\nb\n\nc\n'
    >>> lines_to_str(["a", "b", "", "c"], trailing_newline=True)
    'a\nb\n\nc\n'
    >>> lines_to_str(["a", "b", "", "c"], trailing_newline=False)
    'a\nb\n\nc'
    >>> lines_to_str(["a", "b", "", "c", ""], trailing_newline=False)
    'a\nb\n\nc'
    """
    if not isinstance(lines, Iterable):
        raise type_error(lines, "lines", Iterable)

    joined = "\n".join(lines)
    # rstrip() drops *all* trailing white space, not only newlines
    stripped = joined.rstrip()
    return f"{stripped}\n" if trailing_newline else stripped

56 

57 

def enforce_non_empty_str(text: str) -> str:
    """
    Make sure that a value is a string with at least one character.

    The string is returned unchanged, which allows using this function
    inline in assignments.

    :param text: the value to check
    :returns: `text` itself
    :raises TypeError: if `text` is not a `str`
    :raises ValueError: if `text` is empty
    """
    if not isinstance(text, str):
        raise type_error(text, "text", str)
    if text:
        return text
    raise ValueError(f"Non-empty str expected, but got '{text}'.")

72 

73 

def enforce_non_empty_str_without_ws(text: str) -> str:
    """
    Make sure that a value is a non-empty string free of white space.

    The characters treated as white space are exactly those listed in
    `string.whitespace`.

    :param text: the value to check
    :returns: `text` itself
    :raises TypeError: if `text` is not a `str`
    :raises ValueError: if `text` is empty or contains any white space
        characters
    """
    checked = enforce_non_empty_str(text)
    for ws_char in string.whitespace:
        if ws_char in checked:
            raise ValueError(
                f"No white space allowed in string, but got '{checked}'.")
    return checked

89 

90 

def datetime_to_date_str(date: datetime.datetime) -> str:
    """
    Render the date part of a datetime object as year, month, and day.

    The three fields are joined with U+2011 (non-breaking hyphen) instead
    of the plain ASCII hyphen.

    :param date: the datetime object
    :return: the date string
    :raises TypeError: if `date` is not a `datetime.datetime`
    """
    if isinstance(date, datetime.datetime):
        date_format = "%Y\u2011%m\u2011%d"
        return date.strftime(date_format)
    raise type_error(date, "date", datetime.datetime)

101 

102 

def datetime_to_datetime_str(date: datetime.datetime) -> str:
    """
    Render a datetime object as date, time (hours:minutes), and time zone.

    The date fields are joined with U+2011 (non-breaking hyphen); date,
    time, and time zone name are separated by U+00A0 (no-break space).
    The time zone name is whatever `%Z` produces, which is the empty string
    for a datetime object without time zone information.

    :param date: the datetime object
    :return: the date-time string
    :raises TypeError: if `date` is not a `datetime.datetime`
    """
    if isinstance(date, datetime.datetime):
        datetime_format = "%Y\u2011%m\u2011%d\u00a0%H:%M\u00a0%Z"
        return date.strftime(datetime_format)
    raise type_error(date, "date", datetime.datetime)

113 

114 

def enforce_url(url: str) -> str:
    """
    Make sure that a string is an acceptable url and return it normalized.

    The checks are, in this order: the url must be a non-empty string
    without white space, it must not contain `..`, its scheme must be
    `ssh`, `http`, or `https`, only `ssh` urls may contain `@`, and both
    the network location and the path must be non-empty.

    :param url: the url
    :return: the url as re-assembled by `urlparse(url).geturl()`
    :raises TypeError: if `url` is not a `str`
    :raises ValueError: if any of the checks listed above fails
    """
    enforce_non_empty_str_without_ws(url)
    if ".." in url:
        raise ValueError(f"Invalid url '{url}', contains '..'.")

    parsed = urlparse(url)
    scheme = parsed.scheme
    is_ssh = scheme == "ssh"
    if (not is_ssh) and (scheme not in ("http", "https")):
        raise ValueError(f"Invalid scheme '{scheme}' in url '{url}'.")
    if (not is_ssh) and ("@" in url):
        raise ValueError(
            f"Non-ssh URL must not contain '@', but '{url}' does")

    # both components must be present
    for component in (parsed.netloc, parsed.path):
        enforce_non_empty_str_without_ws(component)
    return parsed.geturl()

135 

136 

137def get_prefix_str(str_list: tuple[str, ...] | list[str]) -> str: 

138 r""" 

139 Compute the common prefix string. 

140 

141 :param str_list: the list of strings 

142 :return: the common prefix 

143 

144 >>> get_prefix_str(["abc", "acd"]) 

145 'a' 

146 >>> get_prefix_str(["xyz", "gsdf"]) 

147 '' 

148 >>> get_prefix_str([]) 

149 '' 

150 >>> get_prefix_str(["abx"]) 

151 'abx' 

152 >>> get_prefix_str(("\\relative.path", "\\relative.figure", 

153 ... "\\relative.code")) 

154 '\\relative.' 

155 """ 

156 if len(str_list) <= 0: 

157 return "" 

158 prefix_str = "" 

159 len_smallest_str = min([len(str_mem) for str_mem in str_list]) 

160 str_list_0 = str_list[0] 

161 for i in range(len_smallest_str): 

162 f = str_list_0[i] 

163 if len([0 for ind in range(1, len(str_list)) 

164 if f != str_list[ind][i]]) > 0: 

165 break 

166 prefix_str += f 

167 return prefix_str 

168 

169 

#: Maps plain language identifiers to the locale used for them by default.
__LANG_DICT: Final[dict[str, str]] = {
    "en": "en_US",
    "zh": "zh_CN",
    "cn": "zh_CN",
    "tw": "zh_TW",
    "de": "de_DE",
    "fr": "fr_FR",
    "it": "it_IT",
    "ja": "ja_JP",
    "ko": "ko_KR",
    "pt": "pt_BR",
    "es": "es_ES",
}


def lang_to_locale(lang: str) -> str:
    """
    Translate a language ID into a locale name.

    Language IDs listed in the module's language-to-locale dictionary are
    mapped to their default locale. Every other ID is returned with each
    `-` replaced by `_`.

    :param lang: the language id
    :return: the locale
    :raises TypeError: if `lang` is not a `str`
    :raises ValueError: if `lang` is empty or contains white space
    """
    lang = enforce_non_empty_str_without_ws(lang)
    known_locale = __LANG_DICT.get(lang)
    if known_locale is not None:
        return known_locale
    # a string without "-" comes back unchanged from replace()
    return lang.replace("-", "_")

199 

200 

def file_size(size: int) -> str:
    """
    Convert a file size given in bytes to a human-readable string.

    The size is expressed in the smallest binary unit (B, KiB, MiB, ...,
    YiB) for which the number is below 1024. The number is always rounded
    *up* to the next integer. Sizes too large for this rule are expressed
    in YiB, the largest unit.

    The ceiling is computed with integer arithmetic only: floating point
    division loses precision for large sizes and would, for example, turn
    exactly 1000 PiB into "1001 PiB".

    :param size: the size in bytes, a non-negative integer
    :return: the string, consisting of number, space, and unit
    :raises ValueError: if `size` is not an `int` or is negative
    """
    if (not isinstance(size, int)) or (size < 0):
        raise ValueError(f"Invalid size: {size}.")
    if size == 0:
        return "0 B"

    suffixes = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")
    base_size: int = 1
    for suffix in suffixes[:-1]:
        # exact integer ceiling of size / base_size
        ret_size = (size + base_size - 1) // base_size
        if ret_size < 1024:
            return f"{ret_size} {suffix}"
        base_size *= 1024
    # nothing larger than the last unit exists, so it takes whatever is left
    return f"{(size + base_size - 1) // base_size} {suffixes[-1]}"

220 

221 

222#: The dictionary with "and" concatenations 

223__AND_DICT: dict[str, tuple[str, str]] = { 

224 "de": (" und ", " und "), 

225 "en": (" and ", ", and "), 

226} 

227 

228 

229def to_string(obj, 

230 locale: str | None = None, 

231 use_seq_and: bool = True) -> str: 

232 """ 

233 Convert any object to a string, try to use a proper locale. 

234 

235 :param obj: the input object 

236 :param locale: the locale 

237 :param use_seq_and: should we use "and" in sequences? 

238 :return: the string representation 

239 """ 

240 if obj is None: 

241 return "None" 

242 

243 if isinstance(obj, str): 

244 return obj.strip() 

245 

246 if isinstance(obj, Iterable): 

247 merge = ", " 

248 if (locale is not None) and (locale.startswith("zh")): 

249 merge = "," 

250 

251 seq = [to_string(r, locale, use_seq_and).strip() for r in obj] 

252 seql = len(seq) 

253 if seql == 1: 

254 return seq[0] 

255 

256 if use_seq_and and (locale is not None): 

257 ands = __AND_DICT.get(locale, None) 

258 if not ands: 

259 ands = __AND_DICT.get(locale.split("_")[0], None) 

260 if ands: 

261 if seql == 2: 

262 return ands[0].join(seq) 

263 res = merge.join(seq[:-1]) 

264 return ands[1].join([res, seq[-1]]) 

265 

266 return merge.join(seq) 

267 

268 return str(obj).strip() 

269 

270 

271def regex_sub(search: str | Pattern, 

272 replace: Callable | str, 

273 inside: str) -> str: 

274 r""" 

275 Replace all occurrences of 'search' in 'inside' with 'replace'. 

276 

277 :param search: the regular expression to search 

278 :param replace: the regular expression to replace it with 

279 :param inside: the string in which to search/replace 

280 :return: the new string after the recursive replacement 

281 

282 >>> regex_sub('[ \t]+\n', '\n', ' bla \nxyz\tabc\t\n') 

283 ' bla\nxyz\tabc\n' 

284 >>> regex_sub('[0-9]A', 'X', '23A7AA') 

285 '2XXA' 

286 """ 

287 while True: 

288 text = re.sub(search, replace, inside, re.MULTILINE) 

289 if text is inside: 

290 return inside 

291 inside = text