Coverage for pycommons / strings / string_tools.py: 100%
136 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
1"""Routines for handling strings."""
3from re import Match, subn
4from re import compile as _compile
5from typing import Any, Callable, Final, Generator, Iterable, Pattern, cast
7from pycommons.types import type_error
9#: fast call to :meth:`str.__len__`
10__LEN: Final[Callable[[str], int]] = cast("Callable[[str], int]", str.__len__)
13def replace_str(find: str, replace: str, src: str) -> str:
14 """
15 Perform a recursive replacement of strings.
17 After applying this function, there will not be any occurence of `find`
18 left in `src`. All of them will have been replaced by `replace`. If that
19 produces new instances of `find`, these will be replaced as well
20 *unless they do not make the string shorter*. In other words, the
21 replacement is continued only if the new string becomes shorter.
23 See :func:`replace_regex` for regular-expression based replacements.
25 :param find: the string to find
26 :param replace: the string with which it will be replaced
27 :param src: the string in which we search
28 :returns: the string `src`, with all occurrences of find replaced by
29 replace
30 :raises TypeError: if any of the parameters are not strings
32 >>> replace_str("a", "b", "abc")
33 'bbc'
34 >>> replace_str("aa", "a", "aaaaa")
35 'a'
36 >>> replace_str("aba", "a", "abaababa")
37 'aa'
38 >>> replace_str("aba", "aba", "abaababa")
39 'abaababa'
40 >>> replace_str("aa", "aa", "aaaaaaaa")
41 'aaaaaaaa'
42 >>> replace_str("a", "aa", "aaaaaaaa")
43 'aaaaaaaaaaaaaaaa'
44 >>> replace_str("a", "xx", "aaaaaaaa")
45 'xxxxxxxxxxxxxxxx'
47 >>> try:
48 ... replace_str(None, "a", "b")
49 ... except TypeError as te:
50 ... print(te)
51 replace() argument 1 must be str, not None
53 >>> try:
54 ... replace_str(1, "a", "b")
55 ... except TypeError as te:
56 ... print(te)
57 replace() argument 1 must be str, not int
59 >>> try:
60 ... replace_str("a", None, "b")
61 ... except TypeError as te:
62 ... print(te)
63 replace() argument 2 must be str, not None
65 >>> try:
66 ... replace_str("x", 1, "b")
67 ... except TypeError as te:
68 ... print(te)
69 replace() argument 2 must be str, not int
71 >>> try:
72 ... replace_str("a", "v", None)
73 ... except TypeError as te:
74 ... print(te)
75 descriptor '__len__' requires a 'str' object but received a 'NoneType'
77 >>> try:
78 ... replace_str("x", "xy", 1)
79 ... except TypeError as te:
80 ... print(te)
81 descriptor '__len__' requires a 'str' object but received a 'int'
82 """
83 new_len: int = __LEN(src)
84 while True:
85 src = src.replace(find, replace)
86 old_len: int = new_len
87 new_len = __LEN(src)
88 if new_len >= old_len:
89 return src
92def replace_regex(search: str | Pattern,
93 replace: str | Callable[[Match], str],
94 inside: str) -> str:
95 r"""
96 Replace all occurrences of 'search' in 'inside' with 'replace'.
98 This replacement procedure is done repetitively and recursively until
99 no occurrence of `search` is found anymore. This, of course, may lead
100 to an endless loop, so a `ValueError` is thrown if there are too many
101 recursive replacements.
103 :param search: the regular expression to search, either a string or a
104 pattern
105 :param replace: the string to replace it with, or a function receiving
106 a :class:`re.Match` instance and returning a replacement string
107 :param inside: the string in which to search/replace
108 :returns: the new string after the recursive replacement
109 :raises TypeError: if any of the parameters is not of the right type
110 :raises ValueError: if there are 100000 recursive replacements or more,
111 indicating that there could be an endless loop
113 >>> replace_regex('[ \t]+\n', '\n', ' bla \nxyz\tabc\t\n')
114 ' bla\nxyz\tabc\n'
115 >>> replace_regex('[0-9]A', 'X', '23A7AA')
116 '2XXA'
117 >>> from re import compile as cpx
118 >>> replace_regex(cpx('[0-9]A'), 'X', '23A7AA')
119 '2XXA'
121 >>> def __repl(a):
122 ... print(repr(a))
123 ... return "y"
124 >>> replace_regex("a.b", __repl, "albaab")
125 <re.Match object; span=(0, 3), match='alb'>
126 <re.Match object; span=(3, 6), match='aab'>
127 'yy'
129 >>> def __repl(a):
130 ... print(repr(a))
131 ... ss = a.group()
132 ... print(ss)
133 ... return "axb"
134 >>> replace_regex("aa.bb", __repl, "aaaaaxbbbbb")
135 <re.Match object; span=(3, 8), match='aaxbb'>
136 aaxbb
137 <re.Match object; span=(2, 7), match='aaxbb'>
138 aaxbb
139 <re.Match object; span=(1, 6), match='aaxbb'>
140 aaxbb
141 <re.Match object; span=(0, 5), match='aaxbb'>
142 aaxbb
143 'axb'
145 >>> replace_regex("aa.bb", "axb", "aaaaaxbbbbb")
146 'axb'
147 >>> replace_regex("aa.bb", "axb", "".join("a" * 100 + "y" + "b" * 100))
148 'axb'
149 >>> replace_regex("aa.bb", "axb",
150 ... "".join("a" * 10000 + "y" + "b" * 10000))
151 'axb'
153 >>> try:
154 ... replace_regex(1, "1", "2")
155 ... except TypeError as te:
156 ... print(str(te)[0:60])
157 search should be an instance of any in {str, typing.Pattern}
159 >>> try:
160 ... replace_regex(None, "1", "2")
161 ... except TypeError as te:
162 ... print(te)
163 search should be an instance of any in {str, typing.Pattern} but is None.
165 >>> try:
166 ... replace_regex("x", 2, "2")
167 ... except TypeError as te:
168 ... print(te)
169 replace should be an instance of str or a callable but is int, namely 2.
171 >>> try:
172 ... replace_regex("x", None, "2")
173 ... except TypeError as te:
174 ... print(te)
175 replace should be an instance of str or a callable but is None.
177 >>> try:
178 ... replace_regex(1, 1, "2")
179 ... except TypeError as te:
180 ... print(str(te)[0:60])
181 search should be an instance of any in {str, typing.Pattern}
183 >>> try:
184 ... replace_regex("yy", "1", 3)
185 ... except TypeError as te:
186 ... print(te)
187 inside should be an instance of str but is int, namely 3.
189 >>> try:
190 ... replace_regex("adad", "1", None)
191 ... except TypeError as te:
192 ... print(te)
193 inside should be an instance of str but is None.
195 >>> try:
196 ... replace_regex(1, "1", 3)
197 ... except TypeError as te:
198 ... print(str(te)[0:60])
199 search should be an instance of any in {str, typing.Pattern}
201 >>> try:
202 ... replace_regex(1, 3, 5)
203 ... except TypeError as te:
204 ... print(str(te)[0:60])
205 search should be an instance of any in {str, typing.Pattern}
207 >>> try:
208 ... replace_regex("abab|baab|bbab|aaab|aaaa|bbbb", "baba",
209 ... "ababababab")
210 ... except ValueError as ve:
211 ... print(str(ve)[:50])
212 Too many replacements, pattern re.compile('abab|ba
213 """
214 if not isinstance(search, Pattern):
215 if isinstance(search, str):
216 search = _compile(search)
217 else:
218 raise type_error(search, "search", (str, Pattern))
219 if not (isinstance(replace, str) or callable(replace)):
220 raise type_error(replace, "replace", str, call=True)
221 if not isinstance(inside, str):
222 raise type_error(inside, "inside", str)
223 for _ in range(100_000):
224 text, n = subn(pattern=search, repl=replace, string=inside)
225 if (n <= 0) or (text == inside):
226 return text
227 inside = text
228 raise ValueError(
229 f"Too many replacements, pattern {search!r} probably malformed for "
230 f"text {inside!r} and replacement {replace!r}.")
233def get_prefix_str(strings: str | Iterable[str]) -> str:
234 r"""
235 Compute the common prefix of an iterable of strings.
237 :param strings: the iterable of strings
238 :returns: the common prefix
239 :raises TypeError: if the input is not a string, iterable of string,
240 or contains any non-string element (before the prefix is determined)
241 Notice: If the prefix is determined as the empty string, then the
242 search is stopped. If some non-`str` items follow later in `strings`,
243 then these may not raise a `TypeError`
245 >>> get_prefix_str(["abc", "acd"])
246 'a'
247 >>> get_prefix_str(["xyz", "gsdf"])
248 ''
249 >>> get_prefix_str([])
250 ''
251 >>> get_prefix_str(["abx"])
252 'abx'
253 >>> get_prefix_str(("abx", ))
254 'abx'
255 >>> get_prefix_str({"abx", })
256 'abx'
257 >>> get_prefix_str("abx")
258 'abx'
259 >>> get_prefix_str(("\\relative.path", "\\relative.figure",
260 ... "\\relative.code"))
261 '\\relative.'
262 >>> get_prefix_str({"\\relative.path", "\\relative.figure",
263 ... "\\relative.code"})
264 '\\relative.'
266 >>> try:
267 ... get_prefix_str(None)
268 ... except TypeError as te:
269 ... print(te)
270 strings should be an instance of any in {str, typing.Iterable} but is None.
272 >>> try:
273 ... get_prefix_str(1)
274 ... except TypeError as te:
275 ... print(str(te)[:60])
276 strings should be an instance of any in {str, typing.Iterabl
278 >>> try:
279 ... get_prefix_str(["abc", "acd", 2, "x"])
280 ... except TypeError as te:
281 ... print(te)
282 descriptor '__len__' requires a 'str' object but received a 'int'
284 >>> try:
285 ... get_prefix_str(["abc", "acd", None, "x"])
286 ... except TypeError as te:
287 ... print(te)
288 descriptor '__len__' requires a 'str' object but received a 'NoneType'
290 >>> get_prefix_str(["xyz", "gsdf", 5])
291 ''
292 """
293 if isinstance(strings, str):
294 return strings
295 if not isinstance(strings, Iterable):
296 raise type_error(strings, "strings", (str, Iterable))
297 prefix: str | None = None
298 prefix_len: int = -1
299 for current in strings: # iterate over all strings
300 current_len: int = __LEN(current)
301 if prefix is None: # no prefix set yet
302 prefix = current
303 prefix_len = current_len
304 else: # we got a prefix
305 for i in range(min(prefix_len, current_len)):
306 if prefix[i] != current[i]:
307 prefix_len = i
308 break
309 if prefix_len == 0:
310 return ""
311 return "" if prefix is None else prefix[0:prefix_len]
314def split_str(source: str, split_by: str) -> Generator[str, None, None]:
315 """
316 Split a string by the given other string.
318 The goal is to provide a less memory intense variant of the method
319 :meth:`str.split`. This routine should iteratively divide a given string
320 based on a splitting character or string. This function may be useful if
321 we are dealing with a very big `source` string and want to iteratively
322 split it into smaller strings. Instead of creating a list with many small
323 strings, what :meth:`str.split` does, it creates these strings
324 iteratively
326 :param source: the source string
327 :param split_by: the split string
328 :returns: each split element
330 >>> list(split_str("", ""))
331 ['']
333 >>> list(split_str("", "x"))
334 ['']
336 >>> list(split_str("a", ""))
337 ['a']
339 >>> list(split_str("abc", ""))
340 ['a', 'b', 'c']
342 >>> list(split_str("a;b;c", ";"))
343 ['a', 'b', 'c']
345 >>> list(split_str("a;b;c;", ";"))
346 ['a', 'b', 'c', '']
348 >>> list(split_str(";a;b;;c;", ";"))
349 ['', 'a', 'b', '', 'c', '']
351 >>> list(split_str("a;aaa;aba;aa;aca;a", "a;a"))
352 ['', 'a', 'b', '', 'c', '']
353 """
354 src_len: Final[int] = str.__len__(source)
355 if src_len <= 0: # handle empty input strings
356 yield "" # the source is empty, so the split is empty, too
357 return # quit after returning the empty string
359 split_len: Final[int] = str.__len__(split_by)
360 if split_len <= 0: # handle empty split patterns
361 yield from source # if the split is empty, we return each character
362 return # and quit
364 start: int = 0 # now we work for non-empty split patterns
365 finder: Final[Callable[[str, int], int]] = source.find # fast call
366 while start < src_len:
367 end = finder(split_by, start)
368 if end < 0:
369 yield source[start:] if start > 0 else source
370 return # pattern not found anymore, so we quit
371 yield source[start:end]
372 start = end + split_len
373 yield "" # pattern found at the end of the string
376def escape(text: str, escapes: Iterable[str]) -> tuple[str, Any]:
377 """
378 Escapes a set of substrings inside a string in a reversible manner.
380 A set of character sequences (`escapes`) are to be removed from
381 `text` and to be replaced with characters that do not occur inside
382 `text`. Escaping is a bijection. Since all escaped sequences are replaced
383 with characters that are new to the string, there cannot be any issue with
384 recursively occuring patterns or ambigiuties.
386 Replacement is performed iteratively from beginning to end. The first
387 sequence from `escapes` that is discovered is replaced and then the
388 process continues. If two sequences start at the same place, then the
389 longer one is replaced first.
391 The same `text` with the same set of escapes will always produce the same
392 output, regardless of the order of the escapes.
394 The function returns a tuple containing the escaped string as well as the
395 setup used for the escaping (as the second element). This second element
396 must *only* be used by the function :func:`unescape`, which is the reverse
397 operator of :func:`escape`. You must not make any assumption about its
398 nature.
400 :param text: the text
401 :param escapes: the substrings to escape
402 :return: a tuple of an escaped version of the string, together with
403 the escape information.
404 The second part of the tuple must not be accessed.
406 >>> s, v = escape("12345", ("12", "X", "5"))
407 >>> print(s)
408 "34!
409 >>> print(v)
410 [('12', '"'), ('5', '!')]
412 >>> unescape(s, v)
413 '12345'
415 >>> s, v = escape('"123!45', ("12", "X", "5", "!", '"'))
416 >>> print(s)
417 $&3#4%
418 >>> print(v)
419 [('!', '#'), ('"', '$'), ('12', '&'), ('5', '%')]
421 >>> unescape(s, v)
422 '"123!45'
424 >>> s, v = escape('"123!45', ("X", "5", "12", "!", '"'))
425 >>> print(s)
426 $&3#4%
427 >>> print(v)
428 [('!', '#'), ('"', '$'), ('12', '&'), ('5', '%')]
430 >>> unescape(s, v)
431 '"123!45'
433 >>> s, v = escape('111111112222233321111212121',
434 ... ("1", "11", "2", "222", "1", "32", "321", "21", "33"))
435 >>> print(s)
436 ####'""&(#!$$$
437 >>> print(v)
438 [('1', '!'), ('11', '#'), ('2', '"'), ('21', '$'), ('222', "'"), \
439('321', '('), ('33', '&')]
441 >>> unescape(s, v)
442 '111111112222233321111212121'
444 >>> s, v = escape('111&111112222233321111212X121',
445 ... ("1", "11", "2", "222", "1", "32", "321", "21", "33"))
446 >>> print(s)
447 #!&##!(""')#!$"X!$
448 >>> print(v)
449 [('1', '!'), ('11', '#'), ('2', '"'), ('21', '$'), ('222', '('), \
450('321', ')'), ('33', "'")]
452 >>> unescape(s, v)
453 '111&111112222233321111212X121'
455 >>> s, v = escape('221', ("22", "21"))
456 >>> print(s)
457 "1
458 >>> print(v)
459 [('22', '"')]
461 >>> s, v = escape('22221', ("2222", "2221", "22", "21"))
462 >>> print(s)
463 $1
464 >>> print(v)
465 [('2222', '$')]
467 >>> unescape(s, v)
468 '22221'
470 >>> s, v = escape('222212222122221', ("2222", "2221", "22", "21"))
471 >>> print(s)
472 $1$1$1
473 >>> print(v)
474 [('2222', '$')]
476 >>> unescape(s, v)
477 '222212222122221'
479 >>> s, v = escape('222212222122221', ("2222", "2221", "22", "21", '"1'))
480 >>> print(s)
481 %1%1%1
482 >>> print(v)
483 [('2222', '%')]
485 >>> unescape(s, v)
486 '222212222122221'
488 >>> try:
489 ... escape(1, None)
490 ... except TypeError as te:
491 ... print(te)
492 descriptor '__len__' requires a 'str' object but received a 'int'
494 >>> try:
495 ... escape("x", 5)
496 ... except TypeError as te:
497 ... print(te)
498 'int' object is not iterable
500 >>> try:
501 ... escape("x", (5, ))
502 ... except TypeError as te:
503 ... print(te)
504 descriptor '__len__' requires a 'str' object but received a 'int'
506 >>> s, v = escape("", ("12", ))
507 >>> s == ""
508 True
509 >>> v is None
510 True
512 >>> s, v = escape("12", [])
513 >>> s == "12"
514 True
515 >>> v is None
516 True
518 >>> s, v = escape("12", ["3", "4", "5"])
519 >>> s == "12"
520 True
521 >>> v is None
522 True
524 >>> try:
525 ... s, v = escape("1" * 1_073_741_826, ("x", ))
526 ... except ValueError as ve:
527 ... print(ve)
528 We rather not escape a string with 1073741826 characters.
530 >>> try:
531 ... s, v = escape("".join(chr(i) for i in range(524_290)), ("x", ))
532 ... except ValueError as ve:
533 ... print(ve)
534 524290 different characters and 1 escapes are too many.
536 >>> try:
537 ... s, v = escape("123", ("x", ""))
538 ... except ValueError as ve:
539 ... print(ve)
540 Cannot escape empty string.
541 """
542 text_len: Final[int] = str.__len__(text)
543 if text_len <= 0:
544 return "", None
545 if text_len > 1_073_741_824:
546 raise ValueError(
547 f"We rather not escape a string with {text_len} characters.")
549 # check which of the escapes are actually needed
550 forbidden: Final[list[Any]] = []
551 charset: set[str] = set()
552 needs_escaping: bool = False
553 for fb in escapes:
554 if str.__len__(fb) <= 0:
555 raise ValueError("Cannot escape empty string.")
556 if fb in forbidden:
557 continue
558 charset.update(fb)
559 if fb in text:
560 forbidden.append(fb)
561 needs_escaping = True
563 forbidden_len: int = list.__len__(forbidden)
564 if (not needs_escaping) or (forbidden_len <= 0):
565 return text, None
567 # always create the same escape sequences
568 forbidden.sort()
569 # make sure to escape long sequences first
570 forbidden.sort(key=str.__len__) # type: ignore
572 # get the set of all characters in this string
573 charset.update(text)
574 char_len: Final[int] = set.__len__(charset)
575 if (char_len + forbidden_len) > 524_288:
576 raise ValueError(
577 f"{char_len} different characters and "
578 f"{forbidden_len} escapes are too many.")
580 # get the characters to be used for escaping
581 marker: int = 33
582 for i, esc in enumerate(forbidden):
583 while True:
584 cmarker: str = chr(marker)
585 marker += 1
586 if cmarker not in charset:
587 break
588 forbidden[i] = [esc, cmarker, False] # type: ignore
589 charset.add(cmarker)
591 # perform the escaping
592 last: int = 0
593 used: list[tuple[str, str]] = []
594 forbidden_len -= 1
595 while forbidden_len >= 0:
596 first: int = 1_073_741_825
597 ft: list[Any] | None = None
598 for i in range(forbidden_len, -1, -1):
599 ftx = forbidden[i]
600 index = text.find(ftx[0], last)
601 if index >= last:
602 if index < first:
603 ft = ftx
604 first = index
605 else:
606 del forbidden[i]
607 forbidden_len -= 1
608 if (first < 0) or (not ft):
609 break
611 # This form of replacement of subsequences is inefficient.
612 text = str.replace(text, ft[0], ft[1], 1) # Must be first occurence...
613 # f"{text[:first]}{p2}{text[first + str.__len__(p1):]}" # noqa
614 last = first + str.__len__(ft[1])
615 if ft[2]:
616 continue
617 used.append((ft[0], ft[1]))
618 ft[2] = True
620 used.sort()
621 return text, used
624def unescape(text: str, escapes: Any) -> str:
625 """
626 Revert the operation of the :func:`escape` function.
628 See the documentation of the function :func:`escape`.
630 :param text: the text
631 :param escapes: the substrings to escape
632 :return: a tuple of an escaped version of the string, together with
633 the escape sequences.
635 >>> s, v = escape('2345123123^21123z41vvvbH34Zxgo493244747261',
636 ... ("1", "11", "45", "v", "vb", "47", "61", "H3"))
637 >>> print(s)
638 23$!23!23^2#23z4!""('4Zxgo49324%%2&
639 >>> print(v)
640 [('1', '!'), ('11', '#'), ('45', '$'), ('47', '%'), ('61', '&'), \
641('H3', "'"), ('v', '"'), ('vb', '(')]
643 >>> unescape(s, v)
644 '2345123123^21123z41vvvbH34Zxgo493244747261'
646 >>> s, v = escape('23451"23123^2112$3z41#vvvb!H34Zxgo4932%44747261',
647 ... ("1", "11", "45", "v", "vb", "47", "61", "H3"))
648 >>> print(s)
649 23)&"23&23^2(2$3z4&#''-!,4Zxgo4932%4**2+
650 >>> print(v)
651 [('1', '&'), ('11', '('), ('45', ')'), ('47', '*'), ('61', '+'), \
652('H3', ','), ('v', "'"), ('vb', '-')]
654 >>> unescape(s, v)
655 '23451"23123^2112$3z41#vvvb!H34Zxgo4932%44747261'
657 >>> unescape("", [("a", "b"), ])
658 ''
660 >>> unescape("b", [("a", "b"), ])
661 'a'
663 >>> unescape("b", None)
664 'b'
666 >>> unescape("b", [])
667 'b'
669 >>> try:
670 ... unescape("1" * 1_073_741_825, [("1", "2"), ])
671 ... except ValueError as ve:
672 ... print(ve)
673 We rather not unescape a string with 1073741825 characters.
674 """
675 text_len: Final[int] = str.__len__(text)
676 if (text_len <= 0) or (escapes is None) or (
677 list.__len__(escapes) <= 0):
678 return text
679 if text_len > 1_073_741_824:
680 raise ValueError(
681 f"We rather not unescape a string with {text_len} characters.")
683 # perform the un-escaping
684 for orig, repl in escapes:
685 text = str.replace(text, repl, orig)
687 return text