Coverage for pycommons / net / url.py: 98%
86 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
1"""Strict parsing, validation, and normalization routines for URLs."""
3from re import Match, search
4from re import compile as _compile
5from typing import (
6 Any,
7 Final,
8 Pattern,
9 cast,
10)
11from urllib.parse import ParseResult, urljoin, urlparse
13# noinspection PyPackageRequirements
14from pycommons.strings.chars import WHITESPACE_OR_NEWLINE
15from pycommons.types import check_int_range
#: text that is forbidden in a URL that still needs to be resolved against
#: a base URL: more than one `@`, any run of the characters in
#: `WHITESPACE_OR_NEWLINE` or of the listed special characters, or more
#: than one `://`
_FORBIDDEN_IN_RELATIVE_URL: Final[Pattern] = _compile("|".join((
    "@.*@",
    f"[{WHITESPACE_OR_NEWLINE}" r"\\%*?&+\"'=$§!,;|<>\[\](){}²³°^]+",
    "://.*://",
)))

#: text that is forbidden in a fully-expanded URL: everything forbidden in
#: a relative URL plus `..`, path components consisting only of dots, and
#: strings consisting only of dots
_FORBIDDEN_IN_FULL_URL: Final[Pattern] = _compile("|".join((
    _FORBIDDEN_IN_RELATIVE_URL.pattern,
    r"\.\.",
    r"\/\.+\/",
    r"\A\.+\Z",
)))

#: text that is forbidden in a fragment: everything forbidden in a full URL
#: plus the fragment separator `#` itself
_FORBIDDEN_IN_FRAGMENT: Final[Pattern] = _compile(
    f"{_FORBIDDEN_IN_FULL_URL.pattern}|#")
def _check_url_part(part: Any, forbidden: Pattern) -> str:
    """
    Validate one component of a URL and return it as string.

    The checks are performed in this order: the length of `part` must be in
    `1..254`, `part` must not contain any text matching `forbidden`, it must
    consist of ASCII characters only, and it must not end in `'#'` or `'@'`.

    :param part: the URL part to check, must be a `str`
    :param forbidden: the pattern of forbidden text
    :returns: `part`, typed as `str`
    :raises TypeError: if `part` is not a `str` or `forbidden` is neither a
        string nor a compiled pattern
    :raises ValueError: if `part` violates any of the rules above

    >>> try:
    ...     _check_url_part("", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part '' has invalid length 0.

    >>> try:
    ...     _check_url_part(" ", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part ' ' contains the forbidden text ' '.

    >>> try:
    ...     _check_url_part("Äquator", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'Äquator' contains non-ASCII characters.

    >>> try:
    ...     _check_url_part("2" * 260, _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(str(ve)[:60])
    URL part '22222222222222222222222222222222222222222222222222

    >>> try:
    ...     _check_url_part(None, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     _check_url_part(2, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> isinstance(_check_url_part("123", _FORBIDDEN_IN_RELATIVE_URL), str)
    True

    >>> try:
    ...     _check_url_part(3, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     _check_url_part("3", 5)
    ... except TypeError as te:
    ...     print(te)
    first argument must be string or compiled pattern
    """
    # `str.__len__` doubles as type check: it raises a `TypeError` for
    # anything that is not a `str`.
    length: Final[int] = str.__len__(part)
    if (length <= 0) or (length >= 255):
        raise ValueError(f"URL part {part!r} has invalid length {len(part)}.")

    found: Final[Match | None] = search(forbidden, part)
    if found is not None:
        raise ValueError(f"URL part {part!r} contains the forbidden "
                         f"text {found.group()!r}.")

    text: Final[str] = cast("str", part)
    if not text.isascii():
        raise ValueError(
            f"URL part {text!r} contains non-ASCII characters.")

    if text.endswith("#") or text.endswith("@"):
        raise ValueError(
            f"URL part must not end in {text[-1]!r}, but {text!r} does.")
    return text
#: the name of the mailto scheme
_MAILTO_1: Final[str] = "mailto"
#: the short mailto prefix, i.e., the scheme followed by a colon
_MAILTO_2: Final[str] = f"{_MAILTO_1}:"
#: the full mailto prefix, i.e., the scheme followed by `://`
_MAILTO_3: Final[str] = f"{_MAILTO_2}//"
#: the name of the ssh scheme
_SSH: Final[str] = "ssh"

#: the schemes whose URLs must contain a user name followed by `@`
_REQUIRE_USER_NAME_SCHEMES: Final[set] = {_MAILTO_1, _SSH}

#: all permitted URL schemes: `http` and `https` plus the schemes that
#: require a user name
_ALLOWED_SCHEMES: Final[set] = {"http", "https"} | _REQUIRE_USER_NAME_SCHEMES
class URL(str):  # noqa: SLOT000
    r"""
    A normalized and expanded URL.

    This is a very strict URL parsing routine. The idea is that it will only
    produce URLs that are safe for use in almost any environment and throw
    exceptions otherwise.

    We limit the URLs to very few different types and allowed schemes.
    Non-ASCII characters are not allowed, and neither are spaces, `'%'`,
    `'*'`, `'?'`, `'+'`, `'&'`, `'<'`, `'>'`, `','`, `'$'`, `'§'`, `"'"`,
    `'"'`, `'['`, `']'`, `'{'`, `'}'`, `'('`, `')'`, nor `'\'` and a few
    more.

    We also allow `'@'` to occur at most once. This means that URLs cannot
    have any parameters and also that URL-escaping non-ASCII characters is not
    possible either. We thus limit the URLs to mainly static content pointers.

    We also only permit simple schemes such as `http`, `https`, `mailto`, and
    `ssh`.

    The final URL also cannot contain any `'/./'` or `'/../'` or consist of
    any component that equals `'..'`. Every URL and every component must be
    shorter than 255 characters. It is also not allowed that `'://'` occurs
    twice. If the URL is a `mailto` or `ssh` URL, it must provide a username
    component.

    If a port is provided, it must be greater than 0 and less than 65536.
    If a port is specified, a host must be specified as well.
    Only if a netloc is found, then a port or a host may be specified.

    The URL `value` may be a relative URL that is turned into an absolute URL
    using the base URL `base_url`. Of course, then the same restrictions apply
    to the relative original URL, the base URL, and the final absolute URL.

    This class tries to detect email addresses and turns them into valid
    `mailto://` urls.
    It also gobbles up single trailing `/` characters.

    An instance of `URL` is also an instance of :class:`str`, so you can use
    it as string wherever you want. It additionally offers the following
    attributes:

    - :attr:`~URL.scheme`: the URL scheme, e.g., `"http"`
    - :attr:`~URL.netloc`: the URL network location, including user (if any),
      host, and port (if any)
    - :attr:`~URL.host`: the host of the URL
    - :attr:`~URL.port`: the port of the URL, or `None` if no port is
      specified
    - :attr:`~URL.path`: the path part of the URL (without the
      :attr:`~URL.fragment` part, if any), or `None` if no path part is
      specified
    - :attr:`~URL.fragment`: the fragment part of the path, or `None` if the
      path has no fragment


    >>> u1 = URL("mailto:tweise@hfuu.edu.cn")
    >>> print(u1)
    mailto://tweise@hfuu.edu.cn
    >>> print(u1.scheme)
    mailto
    >>> print(u1.netloc)
    tweise@hfuu.edu.cn
    >>> print(u1.host)
    hfuu.edu.cn
    >>> print(u1.port)
    None
    >>> print(u1.path)
    None
    >>> print(u1.fragment)
    None

    >>> u = URL("tweise@hfuu.edu.cn")
    >>> print(u)
    mailto://tweise@hfuu.edu.cn
    >>> print(u.scheme)
    mailto
    >>> print(u.netloc)
    tweise@hfuu.edu.cn
    >>> print(u.host)
    hfuu.edu.cn
    >>> print(u.port)
    None
    >>> print(u.path)
    None
    >>> print(u.fragment)
    None

    >>> URL("mailto://tweise@hfuu.edu.cn")
    'mailto://tweise@hfuu.edu.cn'

    >>> u2 = URL("https://example.com/abc")
    >>> print(u2)
    https://example.com/abc
    >>> print(u2.scheme)
    https
    >>> print(u2.netloc)
    example.com
    >>> print(u2.host)
    example.com
    >>> print(u2.port)
    None
    >>> print(u2.path)
    /abc
    >>> print(u2.fragment)
    None
    >>> u1.host != u2.host
    True

    >>> u = URL("https://example.com/abc/")
    >>> print(u)
    https://example.com/abc
    >>> print(u.scheme)
    https
    >>> print(u.netloc)
    example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    /abc
    >>> print(u.fragment)
    None

    >>> u = URL("https://example.com/")
    >>> print(u)
    https://example.com
    >>> print(u.scheme)
    https
    >>> print(u.netloc)
    example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    None
    >>> print(u.fragment)
    None

    >>> u = URL("ssh://git@example.com/abc")
    >>> print(u)
    ssh://git@example.com/abc
    >>> print(u.scheme)
    ssh
    >>> print(u.netloc)
    git@example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    /abc
    >>> print(u.fragment)
    None

    >>> URL("1.txt", "http://example.com/thomasWeise")
    'http://example.com/1.txt'

    >>> URL("1.txt", "http://example.com/thomasWeise/")
    'http://example.com/thomasWeise/1.txt'

    >>> URL("../1.txt", "http://example.com/thomasWeise/")
    'http://example.com/1.txt'

    >>> URL("https://example.com/1.txt",
    ...     "http://github.com/thomasWeise/")
    'https://example.com/1.txt'

    >>> URL("http://example.com:123/1")
    'http://example.com:123/1'

    >>> u = URL("http://example.com:34/index.html#1")
    >>> print(u)
    http://example.com:34/index.html#1
    >>> print(u.scheme)
    http
    >>> print(u.netloc)
    example.com:34
    >>> print(u.host)
    example.com
    >>> print(u.port)
    34
    >>> print(u.path)
    /index.html
    >>> print(u.fragment)
    1

    >>> try:
    ...     URL("tweise@@hfuu.edu.cn")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'tweise@@hfuu.edu.cn' contains the forbidden text '@@'.

    >>> try:
    ...     URL("http://example.com/index.html#")
    ... except ValueError as ve:
    ...     print(ve)
    URL part must not end in '#', but 'http://example.com/index.html#' does.

    >>> try:
    ...     URL("http://example.com/index.html@")
    ... except ValueError as ve:
    ...     print(ve)
    URL part must not end in '@', but 'http://example.com/index.html@' does.

    >>> try:
    ...     URL("https://example.com/abc(/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc(/23' contains the forbidden text '('.

    >>> try:
    ...     URL("https://example.com/abc]/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc]/23' contains the forbidden text ']'.

    >>> try:
    ...     URL("https://example.com/abcä/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abcä/23' contains non-ASCII characters.

    >>> try:
    ...     URL("https://example.com/abc/./23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc/./23' contains the forbidden text '/./'.

    >>> try:
    ...     URL("https://example.com/abc/../1.txt")
    ... except ValueError as ve:
    ...     print(str(ve)[:-4])
    URL part 'https://example.com/abc/../1.txt' contains the forbidden text '/.

    >>> try:
    ...     URL(r"https://example.com/abc\./23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc\\./23' contains the forbidden text '\\'.

    >>> try:
    ...     URL("https://1.2.com/abc/23/../r")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://1.2.com/abc/23/../r' contains the forbidden text '/../'.

    >>> try:
    ...     URL("https://exa mple.com")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://exa mple.com' contains the forbidden text ' '.

    >>> try:
    ...     URL("ftp://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    Invalid scheme 'ftp' of url 'ftp://example.com' under base None, o

    >>> try:
    ...     URL("http://example.com%32")
    ... except ValueError as ve:
    ...     print(str(ve))
    URL part 'http://example.com%32' contains the forbidden text '%'.

    >>> try:
    ...     URL("mailto://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    'mailto' url 'mailto://example.com' must contain '@' and have user

    >>> try:
    ...     URL("ssh://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:65])
    'ssh' url 'ssh://example.com' must contain '@' and have username,

    >>> try:
    ...     URL("ftp://example.com*32")
    ... except ValueError as ve:
    ...     print(str(ve))
    URL part 'ftp://example.com*32' contains the forbidden text '*'.

    >>> try:
    ...     URL("http://example.com/https://h")
    ... except ValueError as ve:
    ...     print(str(ve)[:74])
    URL part 'http://example.com/https://h' contains the forbidden text '://ex

    >>> try:
    ...     URL("http://user@example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    'http' url 'http://user@example.com' must not contain '@' and have

    >>> try:
    ...     URL("http://" + ("a" * 250))
    ... except ValueError as ve:
    ...     print(str(ve)[-30:])
    aaaaa' has invalid length 257.

    >>> try:
    ...     URL("http://.")
    ... except ValueError as ve:
    ...     print(ve)
    URL part '.' contains the forbidden text '.'.

    >>> try:
    ...     URL("http://..")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://..' contains the forbidden text '..'.

    >>> try:
    ...     URL("http://www.example.com/../1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://www.example.com/../1' contains the forbidden text '/../'.

    >>> try:
    ...     URL("http://www.example.com/./1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://www.example.com/./1' contains the forbidden text '/./'.

    >>> try:
    ...     URL("http://user@example.com/@1")
    ... except ValueError as ve:
    ...     print(str(ve)[:-9])
    URL part 'http://user@example.com/@1' contains the forbidden text '@exampl

    >>> try:
    ...     URL("http://:45/1.txt")
    ... except ValueError as ve:
    ...     print(ve)
    URL 'http://:45/1.txt' has no host?

    >>> try:
    ...     URL("http://example.com:-3/@1")
    ... except ValueError as ve:
    ...     print(ve)
    Port could not be cast to integer value as '-3'

    >>> try:
    ...     URL("http://example.com:0/@1")
    ... except ValueError as ve:
    ...     print(ve)
    port=0 is invalid, must be in 1..65535.

    >>> try:
    ...     URL("http://example.com:65536/@1")
    ... except ValueError as ve:
    ...     print(ve)
    Port out of range 0-65535

    >>> try:
    ...     URL(1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     URL(None)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     URL("http::/1.txt", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     URL("http::/1.txt?x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.txt?x=1' contains the forbidden text '?'.

    >>> try:
    ...     URL("http::/1.txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.txt&x=1' contains the forbidden text '&'.

    >>> try:
    ...     URL("http::/1.+txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.+txt&x=1' contains the forbidden text '+'.

    >>> try:
    ...     URL("http::/1*.+txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1*.+txt&x=1' contains the forbidden text '*'.

    >>> try:
    ...     URL("http://example.com#1#2")
    ... except ValueError as ve:
    ...     print(ve)
    URL part '1#2' contains the forbidden text '#'.
    """

    #: the protocol scheme, e.g., `"https"`
    scheme: Final[str]  # type: ignore
    #: the network location, usually of the form `"user@host:port"`, i.e.,
    #: composed of user name (if present), host, and port (if present)
    netloc: Final[str]  # type: ignore
    #: the host str
    host: Final[str]  # type: ignore
    #: the port, if any (else `None`)
    port: Final[int | None]  # type: ignore
    #: the path, if any (else `None`), but without the fragment component
    path: Final[str | None]  # type: ignore
    #: the path fragment, i.e., the part following a `"#"`, if any (else
    #: `None`)
    fragment: Final[str | None]  # type: ignore

    def __new__(cls, value: Any, base_url: Any | None = None):
        """
        Create the URL.

        :param value: either the full absolute URL or a URL that should be
            resolved against the URL `base_url`
        :param base_url: the base URL to resolve `value` against, or `None` if
            `value` is already an absolute URL
        :returns: `value` itself if it already is a `URL` (then `base_url`
            is ignored), otherwise a newly created instance
        :raises TypeError: if `value` or a provided `base_url` is not a
            string
        :raises ValueError: if the URL, the base URL, or any of the
            components violates one of the restrictions
        """
        if isinstance(value, URL):
            return cast("URL", value)

        # A URL that still gets resolved against a base may contain things
        # like "..", so it is checked against the more lenient pattern.
        relative: Final[bool] = base_url is not None
        url: str = _check_url_part(
            value, _FORBIDDEN_IN_RELATIVE_URL if relative
            else _FORBIDDEN_IN_FULL_URL)
        if relative:
            base: Final[str] = _check_url_part(
                base_url, _FORBIDDEN_IN_FULL_URL)
            url = _check_url_part(urljoin(base, url), _FORBIDDEN_IN_FULL_URL)

        # gobble up a single trailing slash
        url = url.removesuffix("/")

        # normalize mailto URLs that do not contain //
        is_mailto: bool = url.startswith(_MAILTO_2)
        if is_mailto and (not url.startswith(_MAILTO_3)):
            url = _MAILTO_3 + url.removeprefix(_MAILTO_2)

        res: ParseResult = urlparse(url)
        scheme: str | None = res.scheme
        if (not scheme) and (url.count("@") == 1):
            # no scheme but exactly one '@': treat it as an email address
            res = urlparse(_MAILTO_3 + url)
            scheme = res.scheme
            is_mailto = True
        scheme = _check_url_part(scheme, _FORBIDDEN_IN_FULL_URL)

        if scheme not in _ALLOWED_SCHEMES:
            raise ValueError(
                f"Invalid scheme {scheme!r} of url {url!r} under base "
                f"{base_url!r}, only {_ALLOWED_SCHEMES!r} are permitted.")

        netloc: Final[str] = _check_url_part(
            res.netloc, _FORBIDDEN_IN_FULL_URL)

        host: Final[str | None] = res.hostname
        if host is None:
            raise ValueError(f"URL {url!r} has no host?")
        _check_url_part(host, _FORBIDDEN_IN_FULL_URL)

        # accessing `res.port` may itself raise a `ValueError`
        port: Final[int | None] = res.port
        if port is not None:
            check_int_range(port, "port", 1, 65535)

        # an empty path is represented as `None`
        path: str | None = res.path or None
        if path is not None:
            _check_url_part(path, _FORBIDDEN_IN_FULL_URL)

        if is_mailto != (scheme == _MAILTO_1):  # this should be impossible
            raise ValueError(f"url {url!r} has scheme {scheme!r}?")

        # Schemes such as mailto and ssh must have a user name, all other
        # schemes must not have one.
        requires_at: Final[bool] = is_mailto or (
            scheme in _REQUIRE_USER_NAME_SCHEMES)
        username: Final[str | None] = res.username
        has_at: Final[bool] = "@" in netloc
        has_user: Final[bool] = bool(username)
        if requires_at != (has_at and has_user):
            must: Final[str] = "" if requires_at else "not "
            at_text: Final[str] = "@" if has_at else "no @"
            user_text: Final[str] = repr(username) if has_user \
                else "no username"
            raise ValueError(
                f"{scheme!r} url {url!r} must {must}contain '@' and have "
                f"username, but got {at_text} and {user_text}.")

        if res.query or res.params or (res.password is not None):
            # should be impossible, as our regex check already picks this up
            raise ValueError(
                f"Query/parameters/password found in url {url!r}.")

        # an empty fragment is represented as `None`
        fragment: str | None = res.fragment or None
        if fragment is not None:
            _check_url_part(fragment, _FORBIDDEN_IN_FRAGMENT)

        result = super().__new__(cls, _check_url_part(
            res.geturl(), _FORBIDDEN_IN_FULL_URL))

        # attach the parsed components to the new string instance
        result.scheme = scheme  # type: ignore
        result.netloc = netloc  # type: ignore
        result.host = host  # type: ignore
        result.port = port  # type: ignore
        result.path = path  # type: ignore
        result.fragment = fragment  # type: ignore
        return result