Coverage for pycommons/strings/string

1"""Routines for handling strings."""

3from re import Match, subn

4from re import compile as _compile

5from typing import Any, Callable, Final, Generator, Iterable, Pattern, cast

7from pycommons.types import type_error

9#: fast call to :meth:`str.__len__`

10__LEN: Final[Callable[[str], int]] = cast("Callable[[str], int]", str.__len__)

13def replace_str(find: str, replace: str, src: str) -> str:

14 """

15 Perform a recursive replacement of strings.

17 After applying this function, there will not be any occurence of `find`

18 left in `src`. All of them will have been replaced by `replace`. If that

19 produces new instances of `find`, these will be replaced as well

20 *unless they do not make the string shorter*. In other words, the

21 replacement is continued only if the new string becomes shorter.

23 See :func:`replace_regex` for regular-expression based replacements.

25 :param find: the string to find

26 :param replace: the string with which it will be replaced

27 :param src: the string in which we search

28 :returns: the string `src`, with all occurrences of find replaced by

29 replace

30 :raises TypeError: if any of the parameters are not strings

32 >>> replace_str("a", "b", "abc")

33 'bbc'

34 >>> replace_str("aa", "a", "aaaaa")

35 'a'

36 >>> replace_str("aba", "a", "abaababa")

37 'aa'

38 >>> replace_str("aba", "aba", "abaababa")

39 'abaababa'

40 >>> replace_str("aa", "aa", "aaaaaaaa")

41 'aaaaaaaa'

42 >>> replace_str("a", "aa", "aaaaaaaa")

43 'aaaaaaaaaaaaaaaa'

44 >>> replace_str("a", "xx", "aaaaaaaa")

45 'xxxxxxxxxxxxxxxx'

47 >>> try:

48 ... replace_str(None, "a", "b")

49 ... except TypeError as te:

50 ... print(te)

51 replace() argument 1 must be str, not None

53 >>> try:

54 ... replace_str(1, "a", "b")

55 ... except TypeError as te:

56 ... print(te)

57 replace() argument 1 must be str, not int

59 >>> try:

60 ... replace_str("a", None, "b")

61 ... except TypeError as te:

62 ... print(te)

63 replace() argument 2 must be str, not None

65 >>> try:

66 ... replace_str("x", 1, "b")

67 ... except TypeError as te:

68 ... print(te)

69 replace() argument 2 must be str, not int

71 >>> try:

72 ... replace_str("a", "v", None)

73 ... except TypeError as te:

74 ... print(te)

75 descriptor '__len__' requires a 'str' object but received a 'NoneType'

77 >>> try:

78 ... replace_str("x", "xy", 1)

79 ... except TypeError as te:

80 ... print(te)

81 descriptor '__len__' requires a 'str' object but received a 'int'

82 """

83 new_len: int = __LEN(src)

84 while True:

85 src = src.replace(find, replace)

86 old_len: int = new_len

87 new_len = __LEN(src)

88 if new_len >= old_len:

89 return src

92def replace_regex(search: str | Pattern,

93 replace: str | Callable[[Match], str],

94 inside: str) -> str:

95 r"""

96 Replace all occurrences of 'search' in 'inside' with 'replace'.

98 This replacement procedure is done repetitively and recursively until

99 no occurrence of `search` is found anymore. This, of course, may lead

100 to an endless loop, so a `ValueError` is thrown if there are too many

101 recursive replacements.

102

103 :param search: the regular expression to search, either a string or a

104 pattern

105 :param replace: the string to replace it with, or a function receiving

106 a :class:`re.Match` instance and returning a replacement string

107 :param inside: the string in which to search/replace

108 :returns: the new string after the recursive replacement

109 :raises TypeError: if any of the parameters is not of the right type

110 :raises ValueError: if there are 100000 recursive replacements or more,

111 indicating that there could be an endless loop

112

113 >>> replace_regex('[ \t]+\n', '\n', ' bla \nxyz\tabc\t\n')

114 ' bla\nxyz\tabc\n'

115 >>> replace_regex('[0-9]A', 'X', '23A7AA')

116 '2XXA'

117 >>> from re import compile as cpx

118 >>> replace_regex(cpx('[0-9]A'), 'X', '23A7AA')

119 '2XXA'

120

121 >>> def __repl(a):

122 ... print(repr(a))

123 ... return "y"

124 >>> replace_regex("a.b", __repl, "albaab")

125 <re.Match object; span=(0, 3), match='alb'>

126 <re.Match object; span=(3, 6), match='aab'>

127 'yy'

128

129 >>> def __repl(a):

130 ... print(repr(a))

131 ... ss = a.group()

132 ... print(ss)

133 ... return "axb"

134 >>> replace_regex("aa.bb", __repl, "aaaaaxbbbbb")

135 <re.Match object; span=(3, 8), match='aaxbb'>

136 aaxbb

137 <re.Match object; span=(2, 7), match='aaxbb'>

138 aaxbb

139 <re.Match object; span=(1, 6), match='aaxbb'>

140 aaxbb

141 <re.Match object; span=(0, 5), match='aaxbb'>

142 aaxbb

143 'axb'

144

145 >>> replace_regex("aa.bb", "axb", "aaaaaxbbbbb")

146 'axb'

147 >>> replace_regex("aa.bb", "axb", "".join("a" * 100 + "y" + "b" * 100))

148 'axb'

149 >>> replace_regex("aa.bb", "axb",

150 ... "".join("a" * 10000 + "y" + "b" * 10000))

151 'axb'

152

153 >>> try:

154 ... replace_regex(1, "1", "2")

155 ... except TypeError as te:

156 ... print(str(te)[0:60])

157 search should be an instance of any in {str, typing.Pattern}

158

159 >>> try:

160 ... replace_regex(None, "1", "2")

161 ... except TypeError as te:

162 ... print(te)

163 search should be an instance of any in {str, typing.Pattern} but is None.

164

165 >>> try:

166 ... replace_regex("x", 2, "2")

167 ... except TypeError as te:

168 ... print(te)

169 replace should be an instance of str or a callable but is int, namely 2.

170

171 >>> try:

172 ... replace_regex("x", None, "2")

173 ... except TypeError as te:

174 ... print(te)

175 replace should be an instance of str or a callable but is None.

176

177 >>> try:

178 ... replace_regex(1, 1, "2")

179 ... except TypeError as te:

180 ... print(str(te)[0:60])

181 search should be an instance of any in {str, typing.Pattern}

182

183 >>> try:

184 ... replace_regex("yy", "1", 3)

185 ... except TypeError as te:

186 ... print(te)

187 inside should be an instance of str but is int, namely 3.

188

189 >>> try:

190 ... replace_regex("adad", "1", None)

191 ... except TypeError as te:

192 ... print(te)

193 inside should be an instance of str but is None.

194

195 >>> try:

196 ... replace_regex(1, "1", 3)

197 ... except TypeError as te:

198 ... print(str(te)[0:60])

199 search should be an instance of any in {str, typing.Pattern}

200

201 >>> try:

202 ... replace_regex(1, 3, 5)

203 ... except TypeError as te:

204 ... print(str(te)[0:60])

205 search should be an instance of any in {str, typing.Pattern}

206

207 >>> try:

209 ... "ababababab")

210 ... except ValueError as ve:

211 ... print(str(ve)[:50])

212 Too many replacements, pattern re.compile('abab|ba

213 """

214 if not isinstance(search, Pattern):

215 if isinstance(search, str):

216 search = _compile(search)

217 else:

218 raise type_error(search, "search", (str, Pattern))

219 if not (isinstance(replace, str) or callable(replace)):

220 raise type_error(replace, "replace", str, call=True)

221 if not isinstance(inside, str):

222 raise type_error(inside, "inside", str)

223 for _ in range(100_000):

224 text, n = subn(pattern=search, repl=replace, string=inside)

225 if (n <= 0) or (text == inside):

226 return text

227 inside = text

228 raise ValueError(

229 f"Too many replacements, pattern {search!r} probably malformed for "

230 f"text {inside!r} and replacement {replace!r}.")

231

232

233def get_prefix_str(strings: str | Iterable[str]) -> str:

234 r"""

235 Compute the common prefix of an iterable of strings.

236

237 :param strings: the iterable of strings

238 :returns: the common prefix

239 :raises TypeError: if the input is not a string, iterable of string,

240 or contains any non-string element (before the prefix is determined)

241 Notice: If the prefix is determined as the empty string, then the

242 search is stopped. If some non-`str` items follow later in `strings`,

243 then these may not raise a `TypeError`

244

245 >>> get_prefix_str(["abc", "acd"])

246 'a'

247 >>> get_prefix_str(["xyz", "gsdf"])

248 ''

249 >>> get_prefix_str([])

250 ''

251 >>> get_prefix_str(["abx"])

252 'abx'

253 >>> get_prefix_str(("abx", ))

254 'abx'

255 >>> get_prefix_str({"abx", })

256 'abx'

257 >>> get_prefix_str("abx")

258 'abx'

259 >>> get_prefix_str(("\\relative.path", "\\relative.figure",

260 ... "\\relative.code"))

261 '\\relative.'

262 >>> get_prefix_str({"\\relative.path", "\\relative.figure",

263 ... "\\relative.code"})

264 '\\relative.'

265

266 >>> try:

267 ... get_prefix_str(None)

268 ... except TypeError as te:

269 ... print(te)

270 strings should be an instance of any in {str, typing.Iterable} but is None.

271

272 >>> try:

273 ... get_prefix_str(1)

274 ... except TypeError as te:

275 ... print(str(te)[:60])

276 strings should be an instance of any in {str, typing.Iterabl

277

278 >>> try:

279 ... get_prefix_str(["abc", "acd", 2, "x"])

280 ... except TypeError as te:

281 ... print(te)

282 descriptor '__len__' requires a 'str' object but received a 'int'

283

284 >>> try:

285 ... get_prefix_str(["abc", "acd", None, "x"])

286 ... except TypeError as te:

287 ... print(te)

288 descriptor '__len__' requires a 'str' object but received a 'NoneType'

289

290 >>> get_prefix_str(["xyz", "gsdf", 5])

291 ''

292 """

293 if isinstance(strings, str):

294 return strings

295 if not isinstance(strings, Iterable):

296 raise type_error(strings, "strings", (str, Iterable))

297 prefix: str | None = None

298 prefix_len: int = -1

299 for current in strings: # iterate over all strings

300 current_len: int = __LEN(current)

301 if prefix is None: # no prefix set yet

302 prefix = current

303 prefix_len = current_len

304 else: # we got a prefix

305 for i in range(min(prefix_len, current_len)):

306 if prefix[i] != current[i]:

307 prefix_len = i

308 break

309 if prefix_len == 0:

310 return ""

311 return "" if prefix is None else prefix[0:prefix_len]

312

313

314def split_str(source: str, split_by: str) -> Generator[str, None, None]:

315 """

316 Split a string by the given other string.

317

318 The goal is to provide a less memory intense variant of the method

319 :meth:`str.split`. This routine should iteratively divide a given string

320 based on a splitting character or string. This function may be useful if

321 we are dealing with a very big `source` string and want to iteratively

322 split it into smaller strings. Instead of creating a list with many small

323 strings, what :meth:`str.split` does, it creates these strings

324 iteratively

325

326 :param source: the source string

327 :param split_by: the split string

328 :returns: each split element

329

330 >>> list(split_str("", ""))

331 ['']

332

333 >>> list(split_str("", "x"))

334 ['']

335

336 >>> list(split_str("a", ""))

337 ['a']

338

339 >>> list(split_str("abc", ""))

340 ['a', 'b', 'c']

341

342 >>> list(split_str("a;b;c", ";"))

343 ['a', 'b', 'c']

344

345 >>> list(split_str("a;b;c;", ";"))

346 ['a', 'b', 'c', '']

347

348 >>> list(split_str(";a;b;;c;", ";"))

349 ['', 'a', 'b', '', 'c', '']

350

351 >>> list(split_str("a;aaa;aba;aa;aca;a", "a;a"))

352 ['', 'a', 'b', '', 'c', '']

353 """

354 src_len: Final[int] = str.__len__(source)

355 if src_len <= 0: # handle empty input strings

356 yield "" # the source is empty, so the split is empty, too

357 return # quit after returning the empty string

358

359 split_len: Final[int] = str.__len__(split_by)

360 if split_len <= 0: # handle empty split patterns

361 yield from source # if the split is empty, we return each character

362 return # and quit

363

364 start: int = 0 # now we work for non-empty split patterns

365 finder: Final[Callable[[str, int], int]] = source.find # fast call

366 while start < src_len:

367 end = finder(split_by, start)

368 if end < 0:

369 yield source[start:] if start > 0 else source

370 return # pattern not found anymore, so we quit

371 yield source[start:end]

372 start = end + split_len

373 yield "" # pattern found at the end of the string

374

375

376def escape(text: str, escapes: Iterable[str]) -> tuple[str, Any]:

377 """

378 Escapes a set of substrings inside a string in a reversible manner.

379

380 A set of character sequences (`escapes`) are to be removed from

381 `text` and to be replaced with characters that do not occur inside

382 `text`. Escaping is a bijection. Since all escaped sequences are replaced

383 with characters that are new to the string, there cannot be any issue with

384 recursively occuring patterns or ambigiuties.

385

386 Replacement is performed iteratively from beginning to end. The first

387 sequence from `escapes` that is discovered is replaced and then the

388 process continues. If two sequences start at the same place, then the

389 longer one is replaced first.

390

391 The same `text` with the same set of escapes will always produce the same

392 output, regardless of the order of the escapes.

393

394 The function returns a tuple containing the escaped string as well as the

395 setup used for the escaping (as the second element). This second element

396 must *only* be used by the function :func:`unescape`, which is the reverse

397 operator of :func:`escape`. You must not make any assumption about its

398 nature.

399

400 :param text: the text

401 :param escapes: the substrings to escape

402 :return: a tuple of an escaped version of the string, together with

403 the escape information.

404 The second part of the tuple must not be accessed.

405

406 >>> s, v = escape("12345", ("12", "X", "5"))

407 >>> print(s)

408 "34!

409 >>> print(v)

410 [('12', '"'), ('5', '!')]

411

412 >>> unescape(s, v)

413 '12345'

414

415 >>> s, v = escape('"123!45', ("12", "X", "5", "!", '"'))

416 >>> print(s)

417 $&3#4%

418 >>> print(v)

419 [('!', '#'), ('"', '$'), ('12', '&'), ('5', '%')]

420

421 >>> unescape(s, v)

422 '"123!45'

423

424 >>> s, v = escape('"123!45', ("X", "5", "12", "!", '"'))

425 >>> print(s)

426 $&3#4%

427 >>> print(v)

428 [('!', '#'), ('"', '$'), ('12', '&'), ('5', '%')]

429

430 >>> unescape(s, v)

431 '"123!45'

432

433 >>> s, v = escape('111111112222233321111212121',

434 ... ("1", "11", "2", "222", "1", "32", "321", "21", "33"))

435 >>> print(s)

436 ####'""&(#!$$$

437 >>> print(v)

438 [('1', '!'), ('11', '#'), ('2', '"'), ('21', '$'), ('222', "'"), \

439('321', '('), ('33', '&')]

440

441 >>> unescape(s, v)

442 '111111112222233321111212121'

443

444 >>> s, v = escape('111&111112222233321111212X121',

445 ... ("1", "11", "2", "222", "1", "32", "321", "21", "33"))

446 >>> print(s)

447 #!&##!(""')#!$"X!$

448 >>> print(v)

449 [('1', '!'), ('11', '#'), ('2', '"'), ('21', '$'), ('222', '('), \

450('321', ')'), ('33', "'")]

451

452 >>> unescape(s, v)

453 '111&111112222233321111212X121'

454

455 >>> s, v = escape('221', ("22", "21"))

456 >>> print(s)

457 "1

458 >>> print(v)

459 [('22', '"')]

460

461 >>> s, v = escape('22221', ("2222", "2221", "22", "21"))

462 >>> print(s)

463 $1

464 >>> print(v)

465 [('2222', '$')]

466

467 >>> unescape(s, v)

468 '22221'

469

470 >>> s, v = escape('222212222122221', ("2222", "2221", "22", "21"))

471 >>> print(s)

472 $1$1$1

473 >>> print(v)

474 [('2222', '$')]

475

476 >>> unescape(s, v)

477 '222212222122221'

478

479 >>> s, v = escape('222212222122221', ("2222", "2221", "22", "21", '"1'))

480 >>> print(s)

481 %1%1%1

482 >>> print(v)

483 [('2222', '%')]

484

485 >>> unescape(s, v)

486 '222212222122221'

487

488 >>> try:

489 ... escape(1, None)

490 ... except TypeError as te:

491 ... print(te)

492 descriptor '__len__' requires a 'str' object but received a 'int'

493

494 >>> try:

495 ... escape("x", 5)

496 ... except TypeError as te:

497 ... print(te)

498 'int' object is not iterable

499

500 >>> try:

501 ... escape("x", (5, ))

502 ... except TypeError as te:

503 ... print(te)

504 descriptor '__len__' requires a 'str' object but received a 'int'

505

506 >>> s, v = escape("", ("12", ))

507 >>> s == ""

508 True

509 >>> v is None

510 True

511

512 >>> s, v = escape("12", [])

513 >>> s == "12"

514 True

515 >>> v is None

516 True

517

518 >>> s, v = escape("12", ["3", "4", "5"])

519 >>> s == "12"

520 True

521 >>> v is None

522 True

523

524 >>> try:

525 ... s, v = escape("1" * 1_073_741_826, ("x", ))

526 ... except ValueError as ve:

527 ... print(ve)

528 We rather not escape a string with 1073741826 characters.

529

530 >>> try:

531 ... s, v = escape("".join(chr(i) for i in range(524_290)), ("x", ))

532 ... except ValueError as ve:

533 ... print(ve)

534 524290 different characters and 1 escapes are too many.

535

536 >>> try:

537 ... s, v = escape("123", ("x", ""))

538 ... except ValueError as ve:

539 ... print(ve)

540 Cannot escape empty string.

541 """

542 text_len: Final[int] = str.__len__(text)

543 if text_len <= 0:

544 return "", None

545 if text_len > 1_073_741_824:

546 raise ValueError(

547 f"We rather not escape a string with {text_len} characters.")

548

549 # check which of the escapes are actually needed

550 forbidden: Final[list[Any]] = []

551 charset: set[str] = set()

552 needs_escaping: bool = False

553 for fb in escapes:

554 if str.__len__(fb) <= 0:

555 raise ValueError("Cannot escape empty string.")

556 if fb in forbidden:

557 continue

558 charset.update(fb)

559 if fb in text:

560 forbidden.append(fb)

561 needs_escaping = True

562

563 forbidden_len: int = list.__len__(forbidden)

564 if (not needs_escaping) or (forbidden_len <= 0):

565 return text, None

566

567 # always create the same escape sequences

568 forbidden.sort()

569 # make sure to escape long sequences first

570 forbidden.sort(key=str.__len__) # type: ignore

571

572 # get the set of all characters in this string

573 charset.update(text)

574 char_len: Final[int] = set.__len__(charset)

575 if (char_len + forbidden_len) > 524_288:

576 raise ValueError(

577 f"{char_len} different characters and "

578 f"{forbidden_len} escapes are too many.")

579

580 # get the characters to be used for escaping

581 marker: int = 33

582 for i, esc in enumerate(forbidden):

583 while True:

584 cmarker: str = chr(marker)

585 marker += 1

586 if cmarker not in charset:

587 break

588 forbidden[i] = [esc, cmarker, False] # type: ignore

589 charset.add(cmarker)

590

591 # perform the escaping

592 last: int = 0

593 used: list[tuple[str, str]] = []

594 forbidden_len -= 1

595 while forbidden_len >= 0:

596 first: int = 1_073_741_825

597 ft: list[Any] | None = None

598 for i in range(forbidden_len, -1, -1):

599 ftx = forbidden[i]

600 index = text.find(ftx[0], last)

601 if index >= last:

602 if index < first:

603 ft = ftx

604 first = index

605 else:

606 del forbidden[i]

607 forbidden_len -= 1

608 if (first < 0) or (not ft):

609 break

610

611 # This form of replacement of subsequences is inefficient.

612 text = str.replace(text, ft[0], ft[1], 1) # Must be first occurence...

613 # f"{text[:first]}{p2}{text[first + str.__len__(p1):]}" # noqa

614 last = first + str.__len__(ft[1])

615 if ft[2]:

616 continue

617 used.append((ft[0], ft[1]))

618 ft[2] = True

619

620 used.sort()

621 return text, used

622

623

624def unescape(text: str, escapes: Any) -> str:

625 """

626 Revert the operation of the :func:`escape` function.

627

628 See the documentation of the function :func:`escape`.

629

630 :param text: the text

631 :param escapes: the substrings to escape

632 :return: a tuple of an escaped version of the string, together with

633 the escape sequences.

634

635 >>> s, v = escape('2345123123^21123z41vvvbH34Zxgo493244747261',

636 ... ("1", "11", "45", "v", "vb", "47", "61", "H3"))

637 >>> print(s)

638 23$!23!23^2#23z4!""('4Zxgo49324%%2&

639 >>> print(v)

640 [('1', '!'), ('11', '#'), ('45', '$'), ('47', '%'), ('61', '&'), \

641('H3', "'"), ('v', '"'), ('vb', '(')]

642

643 >>> unescape(s, v)

644 '2345123123^21123z41vvvbH34Zxgo493244747261'

645

646 >>> s, v = escape('23451"23123^2112$3z41#vvvb!H34Zxgo4932%44747261',

647 ... ("1", "11", "45", "v", "vb", "47", "61", "H3"))

648 >>> print(s)

649 23)&"23&23^2(2$3z4&#''-!,4Zxgo4932%4**2+

650 >>> print(v)

651 [('1', '&'), ('11', '('), ('45', ')'), ('47', '*'), ('61', '+'), \

652('H3', ','), ('v', "'"), ('vb', '-')]

653

654 >>> unescape(s, v)

655 '23451"23123^2112$3z41#vvvb!H34Zxgo4932%44747261'

656

657 >>> unescape("", [("a", "b"), ])

658 ''

659

660 >>> unescape("b", [("a", "b"), ])

661 'a'

662

663 >>> unescape("b", None)

664 'b'

665

666 >>> unescape("b", [])

667 'b'

668

669 >>> try:

670 ... unescape("1" * 1_073_741_825, [("1", "2"), ])

671 ... except ValueError as ve:

672 ... print(ve)

673 We rather not unescape a string with 1073741825 characters.

674 """

675 text_len: Final[int] = str.__len__(text)

676 if (text_len <= 0) or (escapes is None) or (

677 list.__len__(escapes) <= 0):

678 return text

679 if text_len > 1_073_741_824:

680 raise ValueError(

681 f"We rather not unescape a string with {text_len} characters.")

682

683 # perform the un-escaping

684 for orig, repl in escapes:

685 text = str.replace(text, repl, orig)

686

687 return text

Coverage for pycommons / strings / string_tools.py: 100%

136 statements