Coverage for pycommons / dev / url_replacer.py: 100%
84 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
1"""
2Process a markdown file in order to make it useful for distribution.
4In order to let sphinx properly load and insert the `README.md` file into the
5project's documentation, we need to process this file from the GitHub style
6markdown to a variant suitable for the myst parser used in sphinx. While we
7are at it, we can also turn absolute URLs from the GitHub-`README.md` file
8that point to the documentation URL to relative URLs.
9"""
11from itertools import chain
12from re import Match, Pattern, escape, subn
13from re import compile as re_compile
14from typing import Any, Callable, Final, Iterable, Mapping, cast
16from pycommons.net.url import URL
17from pycommons.types import type_error
#: The separators for URLs in html.
#: Each entry is a 4-tuple `(start_regex, end_regex, start_text, end_text)`:
#: the two regular expression fragments are placed directly before and
#: after the URL pattern when the search expressions are compiled (they
#: tolerate surplus white space), whereas the two plain strings are emitted
#: before and after the replacement text. Single-quoted attributes are
#: re-emitted with double quotes.
__HTML_URL_SEPARTORS: Final[tuple[tuple[str, str, str, str], ...]] = (
    (r'\s+href\s*=\s*"\s*', r'\s*"', ' href="', '"'),
    (r"\s+href\s*=\s*'\s*", r"\s*'", ' href="', '"'),
    (r'\s+src\s*=\s*"\s*', r'\s*"', ' src="', '"'),
    (r"\s+src\s*=\s*'\s*", r"\s*'", ' src="', '"'),
)

#: The separators for URLs in markdown, using the same 4-tuple layout
#: `(start_regex, end_regex, start_text, end_text)` as the html separators:
#: the first entry covers inline links and images (`](url)`), the second
#: one covers URLs in angle brackets (`<url>`).
__MD_URL_SEPARTORS: Final[tuple[tuple[str, str, str, str], ...]] = (
    (r"\s*\]\s*\(\s*", r"\s*\)", r"](", r")"),
    (r"<\s*", r"\s*>", "<", ">"),
)
def __make_base_url_replacer(
        collector: Callable[[
            tuple[Pattern, Callable[[Match], str] | str]], Any],
        base_url_to_replace: str,
        replace_base_url_with: str = "./",
        for_markdown: bool = True) -> None:
    r"""
    Make `(Pattern, Callable)` tuples that replace base URLs in Markdown.

    For every known URL separator style (HTML `href`/`src` attributes and,
    if `for_markdown` is `True`, Markdown links and `<...>` URLs), one
    compiled pattern and one replacement function are handed to `collector`.
    The pattern matches both the `http` and the `https` variant of the base
    URL followed by an arbitrary remainder. The remainder is captured
    non-greedily, i.e., it ends at the *first* closing separator, such that
    several URLs on the same line are replaced independently and text behind
    a URL is never touched. The replacement function emits the normalized
    separator, the replacement for the base URL, and the stripped remainder.

    :param collector: the collector (e.g. :meth:`list.append`) to receive the
        tuple
    :param base_url_to_replace: the base url to be replaced
    :param replace_base_url_with: the string with which the base URL should be
        replaced
    :param for_markdown: should replacers for Markdown be add (`True`) or only
        for HTML (`False`)
    :raises TypeError: if any argument is of the wrong type
    :raises ValueError: if the base URL is neither a `http` nor a `https` URL

    >>> from re import sub
    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x", "./")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com/x\\/?(.*?)\\s*\\)'
    '<\\s*https?://example\\.com/x\\/?(.*?)\\s*>'
    >>> sub(coll[3][0], coll[3][1], " src= ' https://example.com/x/y '")
    ' src="./y"'
    >>> sub(coll[1][0], coll[1][1], " href ='http://example.com/x/y '")
    ' href="./y"'
    >>> sub(coll[2][0], coll[2][1], ' src ="https://example.com/x/y"')
    ' src="./y"'
    >>> sub(coll[0][0], coll[0][1], ' href ="http://example.com/x/y"')
    ' href="./y"'
    >>> sub(coll[4][0], coll[4][1], '[l]( https://example.com/x/y)')
    '[l](./y)'
    >>> sub(coll[4][0], coll[4][1], '![xx ] (http://example.com/x/y/g.jpg)')
    '![xx](./y/g.jpg)'
    >>> sub(coll[5][0], coll[5][1], '< https://example.com/x/y >')
    '<./y>'
    >>> sub(coll[3][0], coll[3][1], "src='https://example.com/x/y ")
    "src='https://example.com/x/y "

    Several URLs on one line are replaced in a single pass and the text
    following a URL is left untouched:

    >>> sub(coll[4][0], coll[4][1],
    ...     '[a](https://example.com/x/y) [b](http://example.com/x/z)')
    '[a](./y) [b](./z)'
    >>> sub(coll[5][0], coll[5][1], '<https://example.com/x/y> and a > b')
    '<./y> and a > b'

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x/", "./")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com/x\\/?(.*?)\\s*\\)'
    '<\\s*https?://example\\.com/x\\/?(.*?)\\s*>'

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x/", "./", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x/", "/")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com/x\\/?(.*?)\\s*\\)'
    '<\\s*https?://example\\.com/x\\/?(.*?)\\s*>'

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x/", "/", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x/", "bb")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x(.*?)\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com/x(.*?)\\s*\\)'
    '<\\s*https?://example\\.com/x(.*?)\\s*>'

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x/", "bb", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x(.*?)\\s*'"

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x", "./")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com/x\\/?(.*?)\\s*\\)'
    '<\\s*https?://example\\.com/x\\/?(.*?)\\s*>'

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x", "./", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x", "/")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com/x\\/?(.*?)\\s*\\)'
    '<\\s*https?://example\\.com/x\\/?(.*?)\\s*>'

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x", "/", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\/?(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\/?(.*?)\\s*'"

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x", "bb")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x(.*?)\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com/x(.*?)\\s*\\)'
    '<\\s*https?://example\\.com/x(.*?)\\s*>'

    >>> coll = list()
    >>> __make_base_url_replacer(
    ...     coll.append, "https://example.com/x", "bb", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x(.*?)\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x(.*?)\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x(.*?)\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x(.*?)\\s*'"

    >>> try:
    ...     __make_base_url_replacer(None, "https://example.com/x", "bb")
    ... except TypeError as te:
    ...     print(te)
    collector should be a callable but is None.

    >>> try:
    ...     __make_base_url_replacer(1, "https://example.com/x", "bb")
    ... except TypeError as te:
    ...     print(te)
    collector should be a callable but is int, namely 1.

    >>> try:
    ...     __make_base_url_replacer(coll.append, None, "bb")
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __make_base_url_replacer(coll.append, 1, "bb")
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __make_base_url_replacer(
    ...         coll.append, "https://example.com/x", None)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'strip' for 'str' objects doesn't apply to a 'NoneType' object

    >>> try:
    ...     __make_base_url_replacer(coll.append, "https://example.com/x", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'strip' for 'str' objects doesn't apply to a 'int' object

    >>> try:
    ...     __make_base_url_replacer(coll.append, "tweise@hfuu.edu.cn", "x")
    ... except ValueError as ve:
    ...     print(ve)
    Invalid scheme 'mailto' for url 'tweise@hfuu.edu.cn'.

    >>> try:
    ...     __make_base_url_replacer(coll.append, "https://example.com/x",
    ...                              "./", None)
    ... except TypeError as te:
    ...     print(te)
    for_markdown should be an instance of bool but is None.

    >>> try:
    ...     __make_base_url_replacer(coll.append, "https://example.com/x",
    ...                              "./", 1)
    ... except TypeError as te:
    ...     print(te)
    for_markdown should be an instance of bool but is int, namely 1.
    """
    if not callable(collector):
        raise type_error(collector, "collector", call=True)
    if not isinstance(for_markdown, bool):
        raise type_error(for_markdown, "for_markdown", bool)
    url: Final[URL] = URL(base_url_to_replace)
    if url.scheme not in {"http", "https"}:
        raise ValueError(
            f"Invalid scheme {url.scheme!r} for url {base_url_to_replace!r}.")
    use_repl: Final[str] = str.strip(replace_base_url_with)

    # Cut off the scheme and accept both `http` and `https` instead.
    use_url: str = f"https?{escape(url[str.__len__(url.scheme):])}"
    if use_repl.startswith(("/", "./")):
        # The replacement already provides the slash, so the slash between
        # the base URL and the remainder is swallowed (if there is one).
        use_url += "?" if use_url.endswith("/") else r"\/?"

    separators = chain(__HTML_URL_SEPARTORS, __MD_URL_SEPARTORS) \
        if for_markdown else __HTML_URL_SEPARTORS
    for sr, er, ss, es in separators:
        collector((
            # `(.*?)` is deliberately non-greedy: the remainder of the URL
            # must end at the first closing separator, not at the last one
            # on the line.
            cast("Pattern", re_compile(f"{sr}{use_url}(.*?){er}")),
            # The current loop values are bound as default arguments, since
            # closures would otherwise all see the values of the last round.
            cast("Callable[[Match], str]",
                 lambda mm, _ss=ss, _es=es, _rr=use_repl:
                 f"{_ss}{_rr}{str.strip(mm.group(1))}{_es}")))
def __make_full_url_replacer(
        collector: Callable[[
            tuple[Pattern, Callable[[Match], str] | str]], Any],
        url_to_replace: str, replace_url_with: str = "./",
        for_markdown: bool = True) -> None:
    r"""
    Make `(Pattern, Callable)` tuples that replace full URLs in Markdown.

    For every known URL separator style (HTML `href`/`src` attributes and,
    if `for_markdown` is `True`, Markdown links and `<...>` URLs), one
    compiled pattern and one replacement string are handed to `collector`.
    The pattern matches both the `http` and the `https` variant of exactly
    the given URL. The replacement string consists of the normalized
    separators around the stripped replacement text. Backslashes in the
    replacement text are escaped, such that the text is always inserted
    literally when the string is used as replacement template by
    :func:`re.sub` or :func:`re.subn`.

    :param collector: the collector (e.g. :meth:`list.append`) to receive the
        tuple
    :param url_to_replace: the full url to be replaced
    :param replace_url_with: the string with which the URL should be
        replaced
    :param for_markdown: should replacers for Markdown be add (`True`) or only
        for HTML (`False`)
    :raises TypeError: if any argument is of the wrong type
    :raises ValueError: if the URL is neither a `http` nor a `https` URL or
        if the replacement is empty or consists only of white space

    >>> from re import sub
    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com/x.jpg", "x.jpg")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com/x\\.jpg\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com/x\\.jpg\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com/x\\.jpg\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com/x\\.jpg\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com/x\\.jpg\\s*\\)'
    '<\\s*https?://example\\.com/x\\.jpg\\s*>'
    >>> sub(coll[1][0], coll[1][1], " href= ' https://example.com/x.jpg '")
    ' href="x.jpg"'
    >>> sub(coll[1][0], coll[1][1], " href='https://example.com/x.jpg '")
    ' href="x.jpg"'
    >>> sub(coll[2][0], coll[2][1], ' src="https://example.com/x.jpg"')
    ' src="x.jpg"'
    >>> sub(coll[2][0], coll[2][1], ' src="https://example.com/x.jpg"')
    ' src="x.jpg"'
    >>> sub(coll[4][0], coll[4][1], '[l]( https://example.com/x.jpg)')
    '[l](x.jpg)'
    >>> sub(coll[4][0], coll[4][1], '![xx ] (https://example.com/x.jpg)')
    '![xx](x.jpg)'
    >>> sub(coll[5][0], coll[5][1], '< https://example.com/x.jpg>')
    '<x.jpg>'

    Backslashes in the replacement are inserted literally:

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com/x.jpg", r"a\b.jpg")
    >>> sub(coll[2][0], coll[2][1], ' src="https://example.com/x.jpg"')
    ' src="a\\b.jpg"'

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com/", "./x")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com\\/?\\s*\\)'
    '<\\s*https?://example\\.com\\/?\\s*>'

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com/", "./x", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com/", "/x")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com\\/?\\s*\\)'
    '<\\s*https?://example\\.com\\/?\\s*>'

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com/", "/x", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com/", "bb")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com\\s*\\)'
    '<\\s*https?://example\\.com\\s*>'

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com/", "bb", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\s*'"

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com", "./x")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com\\/?\\s*\\)'
    '<\\s*https?://example\\.com\\/?\\s*>'

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com", "./x", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com", "/x")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com\\/?\\s*\\)'
    '<\\s*https?://example\\.com\\/?\\s*>'

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com", "/x", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\/?\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\/?\\s*'"

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com", "bb")
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\s*'"
    '\\s*\\]\\s*\\(\\s*https?://example\\.com\\s*\\)'
    '<\\s*https?://example\\.com\\s*>'

    >>> coll = list()
    >>> __make_full_url_replacer(
    ...     coll.append, "https://example.com", "bb", False)
    >>> for k, y in coll:
    ...     print(repr(k.pattern))
    '\\s+href\\s*=\\s*"\\s*https?://example\\.com\\s*"'
    "\\s+href\\s*=\\s*'\\s*https?://example\\.com\\s*'"
    '\\s+src\\s*=\\s*"\\s*https?://example\\.com\\s*"'
    "\\s+src\\s*=\\s*'\\s*https?://example\\.com\\s*'"

    >>> try:
    ...     __make_full_url_replacer(None, "https://example.com", "bb")
    ... except TypeError as te:
    ...     print(te)
    collector should be a callable but is None.

    >>> try:
    ...     __make_full_url_replacer(1, "https://example.com", "bb")
    ... except TypeError as te:
    ...     print(te)
    collector should be a callable but is int, namely 1.

    >>> try:
    ...     __make_full_url_replacer(coll.append, None, "bb")
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __make_full_url_replacer(coll.append, 1, "bb")
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __make_full_url_replacer(coll.append, "http://example.com", None)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'strip' for 'str' objects doesn't apply to a 'NoneType' object

    >>> try:
    ...     __make_full_url_replacer(coll.append, "http://example.com", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'strip' for 'str' objects doesn't apply to a 'int' object

    >>> try:
    ...     __make_full_url_replacer(coll.append, "tweise@hfuu.edu.cn", "1")
    ... except ValueError as ve:
    ...     print(ve)
    Invalid scheme 'mailto' for url 'tweise@hfuu.edu.cn'.

    >>> try:
    ...     __make_full_url_replacer(coll.append, "http://example.com", " ")
    ... except ValueError as ve:
    ...     print(ve)
    Cannot replace URL 'http://example.com' with ' '.

    >>> try:
    ...     __make_full_url_replacer(coll.append, "http://example.com",
    ...                              ".", None)
    ... except TypeError as te:
    ...     print(te)
    for_markdown should be an instance of bool but is None.

    >>> try:
    ...     __make_full_url_replacer(coll.append, "http://example.com",
    ...                              ".", 1)
    ... except TypeError as te:
    ...     print(te)
    for_markdown should be an instance of bool but is int, namely 1.
    """
    if not callable(collector):
        raise type_error(collector, "collector", call=True)
    if not isinstance(for_markdown, bool):
        raise type_error(for_markdown, "for_markdown", bool)
    url: Final[URL] = URL(url_to_replace)
    if url.scheme not in {"http", "https"}:
        raise ValueError(
            f"Invalid scheme {url.scheme!r} for url {url_to_replace!r}.")
    use_repl: Final[str] = str.strip(replace_url_with)
    if str.__len__(use_repl) <= 0:
        raise ValueError(
            f"Cannot replace URL {url!r} with {replace_url_with!r}.")

    # Cut off the scheme and accept both `http` and `https` instead.
    use_url: str = f"https?{escape(url[str.__len__(url.scheme):])}"
    if use_repl.startswith(("/", "./")):
        # A trailing slash of the URL is optional in this case.
        use_url += "?" if use_url.endswith("/") else r"\/?"

    separators = chain(__HTML_URL_SEPARTORS, __MD_URL_SEPARTORS) \
        if for_markdown else __HTML_URL_SEPARTORS
    for sr, er, ss, es in separators:
        collector((
            cast("Pattern", re_compile(f"{sr}{use_url}{er}")),
            # The string is later used as replacement *template*, in which
            # backslashes introduce escapes and group references. They are
            # therefore escaped to get the text inserted literally.
            f"{ss}{use_repl}{es}".replace("\\", "\\\\")))
553def __make_replacer(replacers: Iterable[tuple[
554 Pattern, Callable[[Match], str] | str]]) -> Callable[[str], str]:
555 """
556 Make a function that replaces all URL parts in a string.
558 :param replacers: the replacers patterns
559 :returns: the function that can apply the replacers
561 >>> coll = list()
562 >>> __make_full_url_replacer(coll.append, "https://example.com/log.txt",
563 ... "https://example.org/log.txt")
564 >>> __make_base_url_replacer(coll.append, "https://example.com/", "./")
565 >>> f = __make_replacer(coll)
566 >>> f("bla <a href='https://example.com/log.txt'>x</a> bla")
567 'bla <a href="https://example.org/log.txt">x</a> bla'
568 >>> f("bla <a href='https://example.com/xlog.txt'>x</a> bla")
569 'bla <a href="./xlog.txt">x</a> bla'
571 >>> try:
572 ... f(None)
573 ... except TypeError as te:
574 ... print(te)
575 descriptor 'rstrip' for 'str' objects doesn't apply to a 'NoneType' object
577 >>> try:
578 ... f(1)
579 ... except TypeError as te:
580 ... print(te)
581 descriptor 'rstrip' for 'str' objects doesn't apply to a 'int' object
583 >>> coll.append((None, coll[0][1]))
584 >>> try:
585 ... __make_replacer(coll)
586 ... except TypeError as te:
587 ... print(te)
588 pattern should be an instance of re.Pattern but is None.
590 >>> coll[-1] = (1, coll[0][1])
591 >>> try:
592 ... __make_replacer(coll)
593 ... except TypeError as te:
594 ... print(te)
595 pattern should be an instance of re.Pattern but is int, namely 1.
597 >>> coll[-1] = (coll[0][0], None)
598 >>> try:
599 ... __make_replacer(coll)
600 ... except TypeError as te:
601 ... print(te)
602 replacer should be an instance of str or a callable but is None.
604 >>> coll[-1] = (coll[0][0], 1)
605 >>> try:
606 ... __make_replacer(coll)
607 ... except TypeError as te:
608 ... print(te)
609 replacer should be an instance of str or a callable but is int, namely 1.
611 >>> try:
612 ... __make_replacer(None)
613 ... except TypeError as te:
614 ... print(te)
615 replacers should be an instance of typing.Iterable but is None.
617 >>> try:
618 ... __make_replacer(1)
619 ... except TypeError as te:
620 ... print(te)
621 replacers should be an instance of typing.Iterable but is int, namely 1.
623 >>> coll = list()
624 >>> __make_full_url_replacer(coll.append, "https://example.com",
625 ... "https://example.com")
626 >>> f = __make_replacer(coll)
627 >>> try:
628 ... f("<a href='http://example.com' />")
629 ... except ValueError as ve:
630 ... print(str(ve)[:60])
631 Endless loop: "<a href='http://example.com' />" -> '<a href=
632 """
633 if not isinstance(replacers, Iterable):
634 raise type_error(replacers, "replacers", Iterable)
636 pats: Final[tuple[tuple[Pattern, Callable[[Match], str] | str],
637 ...]] = tuple(replacers)
638 for pattern, replacer in pats:
639 if not isinstance(pattern, Pattern):
640 raise type_error(pattern, "pattern", Pattern)
641 if not (isinstance(replacer, str) or callable(replacer)):
642 raise type_error(replacer, "replacer", str, True)
644 def __func(text: str, __pats=pats) -> str:
645 out_str: str = str.rstrip(text) # enforce string
646 if str.__len__(out_str) <= 0:
647 return ""
648 rc: int = 1
649 iteration: int = 0
650 while rc > 0:
651 rc = 0
652 for pp, rr in __pats:
653 out_str, nn = subn(pp, rr, out_str)
654 rc += nn
655 iteration += 1
656 if iteration > 100:
657 raise ValueError(f"Endless loop: {text!r} -> {out_str!r}.")
658 return str.rstrip(out_str) # enforce string
660 return cast("Callable[[str], str]", __func)
663def make_url_replacer(base_urls: Mapping[str, str] | None = None,
664 full_urls: Mapping[str, str] | None = None,
665 for_markdown: bool = True) \
666 -> Callable[[str], str]:
667 r"""
668 Create the url replacers that fix absolute to relative URLs.
670 :param base_urls: a mapping of basic urls to shortcuts
671 :param full_urls: a mapping of full urls to abbreviations
672 :param for_markdown: should the replace be for Markdown (`True`) or for
673 HTML only (`False`)
674 :returns: a single callable that can process strings and fix the URLs
675 therein
676 :raises TypeError: if any of the inputs is of the wrong type
677 :raises ValueError: if any of the inputs is incorrect
679 >>> f = make_url_replacer(None, None)
680 >>> f("1")
681 '1'
683 >>> f = make_url_replacer({"https://example.com/1": "./a/",
684 ... "https://example.com": "./"},
685 ... {"https://example.com/1/1.txt": "y.txt",
686 ... "https://example.com/x/1.txt": "z.txt"})
687 >>> f("<a href='http://example.com/1/2.txt' />")
688 '<a href="./a/2.txt" />'
689 >>> f("<a href='http://example.com/1' />")
690 '<a href="./a/" />'
691 >>> f("<a href='http://example.com' />")
692 '<a href="./" />'
693 >>> f("<a href='http://example.com/x.txt' />")
694 '<a href="./x.txt" />'
695 >>> f("<a href='http://example.com/1/1.txt' />")
696 '<a href="y.txt" />'
697 >>> f("<a href='http://example.com/x/1.txt' />")
698 '<a href="z.txt" />'
700 >>> try:
701 ... make_url_replacer(1, None)
702 ... except TypeError as te:
703 ... print(te)
704 base_urls should be an instance of typing.Mapping but is int, namely 1.
706 >>> try:
707 ... make_url_replacer(None, 1)
708 ... except TypeError as te:
709 ... print(te)
710 full_urls should be an instance of typing.Mapping but is int, namely 1.
712 >>> try:
713 ... make_url_replacer(None, None, None)
714 ... except TypeError as te:
715 ... print(te)
716 for_markdown should be an instance of bool but is None.
718 >>> try:
719 ... make_url_replacer(None, None, 1)
720 ... except TypeError as te:
721 ... print(te)
722 for_markdown should be an instance of bool but is int, namely 1.
723 """
724 if not isinstance(for_markdown, bool):
725 raise type_error(for_markdown, "for_markdown", bool)
726 keys: list[tuple[str, bool]] = []
728 if base_urls is not None:
729 if not isinstance(base_urls, Mapping):
730 raise type_error(base_urls, "base_urls", Mapping)
731 keys.extend((kk, False) for kk in base_urls)
732 if full_urls is not None:
733 if not isinstance(full_urls, Mapping):
734 raise type_error(full_urls, "full_urls", Mapping)
735 keys.extend((kk, True) for kk in full_urls)
737 if list.__len__(keys) <= 0: # no need to do anything
738 return lambda s: s
740 # long keys and full urls first
741 keys.sort(key=lambda tt: (str.__len__(tt[0]), tt[1], tt[0]), reverse=True)
742 mappings: list[tuple[Pattern, Callable[[Match], str] | str]] = []
743 for k, w in keys:
744 if w:
745 __make_full_url_replacer(mappings.append, k, full_urls[k],
746 for_markdown)
747 else:
748 __make_base_url_replacer(mappings.append, k, base_urls[k],
749 for_markdown)
750 return __make_replacer(mappings)