Coverage for pycommons / net / url.py: 98%
86 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-24 03:11 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-24 03:11 +0000
1"""
2A string class representing a URL.
4Like the class :class:`~pycommons.io.path.Path` in
5:mod:`pycommons.io.path` does for paths in the file system, the class
6:class:`~pycommons.net.url.URL` offers some sort of canonical and very
7conservative representation of URLs, which, at the same time, is also a
8string.
9This makes it convenient to pass the instances of this class into functions
10that otherwise expect strings.
11It also allows you to write functions that accept both strings and URLs as
12parameters.
14>>> u = URL("https://thomasweise.github.io/contact/#address-in-english")
15>>> u
16'https://thomasweise.github.io/contact/#address-in-english'
17>>> u.scheme
18'https'
19>>> u.host
20'thomasweise.github.io'
21>>> u.path
22'/contact/'
23>>> u.fragment
24'address-in-english'
26>>> u = URL("http://thomasweise.github.io/contact/")
27>>> u
28'http://thomasweise.github.io/contact'
29>>> u.scheme
30'http'
31>>> u.host
32'thomasweise.github.io'
33>>> u.path
34'/contact'
35>>> print(u.fragment)
36None
37"""
39from re import Match, search
40from re import compile as _compile
41from typing import (
42 Any,
43 Final,
44 Pattern,
45 cast,
46)
47from urllib.parse import ParseResult, urljoin, urlparse
49# noinspection PyPackageRequirements
50from pycommons.strings.chars import WHITESPACE_OR_NEWLINE
51from pycommons.types import check_int_range
#: Text that is forbidden in a (possibly relative) URL. The pattern matches
#: any of the following: two `'@'` characters with anything in between
#: (i.e., more than one `'@'`), a run of forbidden characters (the
#: characters of `WHITESPACE_OR_NEWLINE`, the backslash, and any of
#: ``% * ? & + " ' = $ § ! , ; | < > [ ] ( ) { } ² ³ ° ^``), or two
#: occurrences of `'://'`.
_FORBIDDEN_IN_RELATIVE_URL: Final[Pattern] = _compile(
    f"@.*@|[{WHITESPACE_OR_NEWLINE}"
    r"\\%*?&+\"'=$§!,;|<>\[\](){}²³°^]+|://.*://")

#: Text that is forbidden in a fully-expanded URL: everything that is
#: forbidden in a relative URL and, additionally, two consecutive dots, a
#: path segment made up only of dots (such as `'/./'`), and a string that
#: consists of nothing but dots.
_FORBIDDEN_IN_FULL_URL: Final[Pattern] = _compile(
    _FORBIDDEN_IN_RELATIVE_URL.pattern + r"|\.\.|\/\.+\/|\A\.+\Z")

#: Text that is forbidden in a fragment: everything that is forbidden in a
#: fully-expanded URL and, additionally, the `'#'` character.
_FORBIDDEN_IN_FRAGMENT: Final[Pattern] = _compile(
    _FORBIDDEN_IN_FULL_URL.pattern + r"|#")
def _check_url_part(part: Any, forbidden: Pattern) -> str:
    """
    Check an url part.

    A part is accepted if it is a non-empty string of at most 255
    characters, contains no text matched by `forbidden`, consists only of
    ASCII characters, and does not end in `'#'` or `'@'`.

    :param part: the part
    :param forbidden: the pattern of forbidden text
    :returns: the url as str
    :raises TypeError: if `part` is not a string or if `forbidden` is
        neither a string nor a compiled pattern
    :raises ValueError: if `part` violates any of the rules listed above

    >>> try:
    ...     _check_url_part("", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part '' has invalid length 0.

    >>> try:
    ...     _check_url_part(" ", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part ' ' contains the forbidden text ' '.

    >>> try:
    ...     _check_url_part("Äquator", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'Äquator' contains non-ASCII characters.

    >>> try:
    ...     _check_url_part("2" * 260, _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(str(ve)[:60])
    URL part '22222222222222222222222222222222222222222222222222

    >>> len(_check_url_part("2" * 255, _FORBIDDEN_IN_RELATIVE_URL))
    255

    >>> try:
    ...     _check_url_part("2" * 256, _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(str(ve)[-19:])
    invalid length 256.

    >>> try:
    ...     _check_url_part(None, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     _check_url_part(2, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> isinstance(_check_url_part("123", _FORBIDDEN_IN_RELATIVE_URL), str)
    True

    >>> try:
    ...     _check_url_part(3, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     _check_url_part("3", 5)
    ... except TypeError as te:
    ...     print(te)
    first argument must be string or compiled pattern
    """
    # `str.__len__` is used instead of `len` on purpose: it raises a
    # TypeError if `part` is not a string and thus doubles as type check.
    length: Final[int] = str.__len__(part)
    # Parts must not be *longer than* 255 characters, so a part of exactly
    # 255 characters is still acceptable.
    if not (0 < length <= 255):
        raise ValueError(f"URL part {part!r} has invalid length {length}.")
    the_match: Final[Match | None] = search(forbidden, part)
    if the_match is not None:
        raise ValueError(f"URL part {part!r} contains the forbidden "
                         f"text {the_match.group()!r}.")
    urlstr: Final[str] = cast("str", part)
    if not urlstr.isascii():
        raise ValueError(
            f"URL part {urlstr!r} contains non-ASCII characters.")
    # a trailing '#' or '@' would announce a fragment or host that is absent
    if urlstr.endswith(("#", "@")):
        raise ValueError(
            f"URL part must not end in {urlstr[-1]!r}, but {urlstr!r} does.")
    return urlstr
#: the mailto scheme
_MAILTO_1: Final[str] = "mailto"
#: the mailto prefix, i.e., the mailto scheme followed by `':'`
_MAILTO_2: Final[str] = _MAILTO_1 + ":"
#: the mailto full prefix, i.e., the mailto prefix followed by `'//'`
_MAILTO_3: Final[str] = _MAILTO_2 + "//"
#: the ssh scheme
_SSH: Final[str] = "ssh"

#: the schemes that require usernames
_REQUIRE_USER_NAME_SCHEMES: Final[set] = {_MAILTO_1, _SSH}

#: all permitted URL schemes: `http` and `https` plus the schemes that
#: require usernames
_ALLOWED_SCHEMES: Final[set] = {"http", "https"}.union(
    _REQUIRE_USER_NAME_SCHEMES)
class URL(str):  # noqa: SLOT000
    r"""
    A normalized and expanded URL.

    This is a very strict URL parsing routine. The idea is that it will only
    produce URLs that are safe for use in almost any environment and throw
    exceptions otherwise.

    We limit the URLs to very few different types and allowed schemes.
    Non-ASCII characters are not allowed, and neither are spaces, `'%'`,
    `'*'`, `'?'`, `'+'`, `'&'`, `'<'`, `'>'`, `','`, `'$'`, `'§'`, `"'"`,
    `'"'`, `'['`, `']'`, `'{'`, `'}'`, `'('`, `')'`, nor `'\'` and a few
    more.

    We also allow `'@'` to occur at most once. This means that URLs cannot
    have any parameters and also that URL-escaping non-ASCII characters is not
    possible either. We thus limit the URLs to mainly static content pointers.

    We also only permit simple schemes such as `http`, `https`, `mailto`, and
    `ssh`.

    The final URL also cannot contain any `'/./'` or `'/../'` or consist of
    any component that equals `'..'`. No URL or component must be longer than
    255 characters either. It is also not allowed that `'://'` occurs twice.
    If the URL is a `mailto` or `ssh` URL, it must provide a username
    component. URLs of any other scheme must not contain an `'@'` in their
    network location, not even one with an empty username in front of it.

    If a port is provided, it must be greater than 0 and less than 65536.
    If a port is specified, a host must be specified as well.
    Only if a netloc is found, then a port or a host may be specified.

    The URL `value` may be a relative URL that is turned into an absolute URL
    using the base URL `base_url`. Of course, then the same restrictions apply
    to the relative original URL, the base URL, and the final absolute URL.

    This function tries to detect email addresses and turns them into valid
    `mailto://` urls.
    This function gobbles up single trailing `/` characters.

    An instance of `URL` is also an instance of :class:`str`, so you can use
    it as a string wherever you want. It additionally offers the following
    attributes:

    - :attr:`~URL.scheme`: the URL scheme, e.g., `"http"`
    - :attr:`~URL.netloc`: the URL network location, including user (if any),
      host, and port (if any)
    - :attr:`~URL.host`: the host of the URL
    - :attr:`~URL.port`: the port of the URL, or `None` if no port is
      specified
    - :attr:`~URL.path`: the path part of the URL (without the
      :attr:`~URL.fragment` part, if any), or `None` if no path part is
      specified
    - :attr:`~URL.fragment`: the fragment part of the path, or `None` if the
      path has no fragment


    >>> u1 = URL("mailto:tweise@hfuu.edu.cn")
    >>> print(u1)
    mailto://tweise@hfuu.edu.cn
    >>> print(u1.scheme)
    mailto
    >>> print(u1.netloc)
    tweise@hfuu.edu.cn
    >>> print(u1.host)
    hfuu.edu.cn
    >>> print(u1.port)
    None
    >>> print(u1.path)
    None
    >>> print(u1.fragment)
    None

    >>> u = URL("tweise@hfuu.edu.cn")
    >>> print(u)
    mailto://tweise@hfuu.edu.cn
    >>> print(u.scheme)
    mailto
    >>> print(u.netloc)
    tweise@hfuu.edu.cn
    >>> print(u.host)
    hfuu.edu.cn
    >>> print(u.port)
    None
    >>> print(u.path)
    None
    >>> print(u.fragment)
    None

    >>> URL("mailto://tweise@hfuu.edu.cn")
    'mailto://tweise@hfuu.edu.cn'

    >>> u2 = URL("https://example.com/abc")
    >>> print(u2)
    https://example.com/abc
    >>> print(u2.scheme)
    https
    >>> print(u2.netloc)
    example.com
    >>> print(u2.host)
    example.com
    >>> print(u2.port)
    None
    >>> print(u2.path)
    /abc
    >>> print(u2.fragment)
    None
    >>> u1.host != u2.host
    True

    >>> u = URL("https://example.com/abc/")
    >>> print(u)
    https://example.com/abc
    >>> print(u.scheme)
    https
    >>> print(u.netloc)
    example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    /abc
    >>> print(u.fragment)
    None

    >>> u = URL("https://example.com/")
    >>> print(u)
    https://example.com
    >>> print(u.scheme)
    https
    >>> print(u.netloc)
    example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    None
    >>> print(u.fragment)
    None

    >>> u = URL("ssh://git@example.com/abc")
    >>> print(u)
    ssh://git@example.com/abc
    >>> print(u.scheme)
    ssh
    >>> print(u.netloc)
    git@example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    /abc
    >>> print(u.fragment)
    None

    >>> URL("1.txt", "http://example.com/thomasWeise")
    'http://example.com/1.txt'

    >>> URL("1.txt", "http://example.com/thomasWeise/")
    'http://example.com/thomasWeise/1.txt'

    >>> URL("../1.txt", "http://example.com/thomasWeise/")
    'http://example.com/1.txt'

    >>> URL("https://example.com/1.txt",
    ...     "http://github.com/thomasWeise/")
    'https://example.com/1.txt'

    >>> URL("http://example.com:123/1")
    'http://example.com:123/1'

    >>> u = URL("http://example.com:34/index.html#1")
    >>> print(u)
    http://example.com:34/index.html#1
    >>> print(u.scheme)
    http
    >>> print(u.netloc)
    example.com:34
    >>> print(u.host)
    example.com
    >>> print(u.port)
    34
    >>> print(u.path)
    /index.html
    >>> print(u.fragment)
    1

    >>> try:
    ...     URL("tweise@@hfuu.edu.cn")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'tweise@@hfuu.edu.cn' contains the forbidden text '@@'.

    >>> try:
    ...     URL("http://example.com/index.html#")
    ... except ValueError as ve:
    ...     print(ve)
    URL part must not end in '#', but 'http://example.com/index.html#' does.

    >>> try:
    ...     URL("http://example.com/index.html@")
    ... except ValueError as ve:
    ...     print(ve)
    URL part must not end in '@', but 'http://example.com/index.html@' does.

    >>> try:
    ...     URL("https://example.com/abc(/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc(/23' contains the forbidden text '('.

    >>> try:
    ...     URL("https://example.com/abc]/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc]/23' contains the forbidden text ']'.

    >>> try:
    ...     URL("https://example.com/abcä/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abcä/23' contains non-ASCII characters.

    >>> try:
    ...     URL("https://example.com/abc/./23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc/./23' contains the forbidden text '/./'.

    >>> try:
    ...     URL("https://example.com/abc/../1.txt")
    ... except ValueError as ve:
    ...     print(str(ve)[:-4])
    URL part 'https://example.com/abc/../1.txt' contains the forbidden text '/.

    >>> try:
    ...     URL(r"https://example.com/abc\./23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc\\./23' contains the forbidden text '\\'.

    >>> try:
    ...     URL("https://1.2.com/abc/23/../r")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://1.2.com/abc/23/../r' contains the forbidden text '/../'.

    >>> try:
    ...     URL("https://exa mple.com")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://exa mple.com' contains the forbidden text ' '.

    >>> try:
    ...     URL("ftp://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    Invalid scheme 'ftp' of url 'ftp://example.com' under base None, o

    >>> try:
    ...     URL("http://example.com%32")
    ... except ValueError as ve:
    ...     print(str(ve))
    URL part 'http://example.com%32' contains the forbidden text '%'.

    >>> try:
    ...     URL("mailto://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    'mailto' url 'mailto://example.com' must contain '@' and have user

    >>> try:
    ...     URL("ssh://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:65])
    'ssh' url 'ssh://example.com' must contain '@' and have username,

    >>> try:
    ...     URL("ftp://example.com*32")
    ... except ValueError as ve:
    ...     print(str(ve))
    URL part 'ftp://example.com*32' contains the forbidden text '*'.

    >>> try:
    ...     URL("http://example.com/https://h")
    ... except ValueError as ve:
    ...     print(str(ve)[:74])
    URL part 'http://example.com/https://h' contains the forbidden text '://ex

    >>> try:
    ...     URL("http://user@example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    'http' url 'http://user@example.com' must not contain '@' and have

    >>> try:
    ...     URL("http://@example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[-26:])
    but got @ and no username.

    >>> try:
    ...     URL("http://" + ("a" * 250))
    ... except ValueError as ve:
    ...     print(str(ve)[-30:])
    aaaaa' has invalid length 257.

    >>> try:
    ...     URL("http://.")
    ... except ValueError as ve:
    ...     print(ve)
    URL part '.' contains the forbidden text '.'.

    >>> try:
    ...     URL("http://..")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://..' contains the forbidden text '..'.

    >>> try:
    ...     URL("http://www.example.com/../1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://www.example.com/../1' contains the forbidden text '/../'.

    >>> try:
    ...     URL("http://www.example.com/./1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://www.example.com/./1' contains the forbidden text '/./'.

    >>> try:
    ...     URL("http://user@example.com/@1")
    ... except ValueError as ve:
    ...     print(str(ve)[:-9])
    URL part 'http://user@example.com/@1' contains the forbidden text '@exampl

    >>> try:
    ...     URL("http://:45/1.txt")
    ... except ValueError as ve:
    ...     print(ve)
    URL 'http://:45/1.txt' has no host?

    >>> try:
    ...     URL("http://example.com:-3/@1")
    ... except ValueError as ve:
    ...     print(ve)
    Port could not be cast to integer value as '-3'

    >>> try:
    ...     URL("http://example.com:0/@1")
    ... except ValueError as ve:
    ...     print(ve)
    port=0 is invalid, must be in 1..65535.

    >>> try:
    ...     URL("http://example.com:65536/@1")
    ... except ValueError as ve:
    ...     print(ve)
    Port out of range 0-65535

    >>> try:
    ...     URL(1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     URL(None)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     URL("http::/1.txt", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     URL("http::/1.txt?x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.txt?x=1' contains the forbidden text '?'.

    >>> try:
    ...     URL("http::/1.txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.txt&x=1' contains the forbidden text '&'.

    >>> try:
    ...     URL("http::/1.+txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.+txt&x=1' contains the forbidden text '+'.

    >>> try:
    ...     URL("http::/1*.+txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1*.+txt&x=1' contains the forbidden text '*'.

    >>> try:
    ...     URL("http://example.com#1#2")
    ... except ValueError as ve:
    ...     print(ve)
    URL part '1#2' contains the forbidden text '#'.
    """

    #: the protocol scheme, e.g., `"https"`
    scheme: Final[str]  # type: ignore
    #: the network location, usually of the form `"user@host:port"`, i.e.,
    #: composed of user name (if present), host, and port (if present)
    netloc: Final[str]  # type: ignore
    #: the host str
    host: Final[str]  # type: ignore
    #: the port, if any (else `None`)
    port: Final[int | None]  # type: ignore
    #: the path, if any (else `None`), but without the fragment component
    path: Final[str | None]  # type: ignore
    #: the path fragment, i.e., the part following a `"#"`, if any (else
    #: `None`)
    fragment: Final[str | None]  # type: ignore

    def __new__(cls, value: Any, base_url: Any | None = None):
        """
        Create the URL.

        :param value: either the full absolute URL or a URL that should be
            resolved against the URL `base_url`
        :param base_url: the base URL to resolve `value` against, or `None` if
            `value` is already an absolute URL
        :returns: `value` itself if it already is an instance of
            :class:`URL` (`base_url` is ignored in this case), otherwise a
            new instance holding the normalized absolute URL
        :raises TypeError: if `value` or a provided `base_url` is not a
            string
        :raises ValueError: if the URL or any of its components is empty, too
            long, contains forbidden text or non-ASCII characters, if the
            scheme is not permitted, if there is no host, if the port is
            invalid, if the presence of `'@'` and user name does not fit to
            the scheme, or if a query, parameters, or a password are found
        """
        if isinstance(value, URL):
            return cast("URL", value)

        # A relative `value` is checked against the more permissive relative
        # pattern, e.g., it may contain '..' which is resolved by `urljoin`.
        url: str = _check_url_part(
            value, _FORBIDDEN_IN_FULL_URL if base_url is None
            else _FORBIDDEN_IN_RELATIVE_URL)
        if base_url is not None:
            # both the base URL and the joined result must be valid full URLs
            url = _check_url_part(urljoin(_check_url_part(
                base_url, _FORBIDDEN_IN_FULL_URL), url),
                _FORBIDDEN_IN_FULL_URL)

        # gobble up one single trailing slash
        url = url.removesuffix("/")

        # normalize mailto URLs that do not contain //
        is_mailto: bool = url.startswith(_MAILTO_2)
        if is_mailto and (not url.startswith(_MAILTO_3)):
            url = _MAILTO_3 + url[str.__len__(_MAILTO_2):]

        res: ParseResult = urlparse(url)
        scheme: str | None = res.scheme
        # no scheme but exactly one '@': treat the text as an email address
        if ((scheme is None) or (str.__len__(scheme) == 0)) and (
                url.count("@") == 1):
            res = urlparse(_MAILTO_3 + url)
            scheme = res.scheme
            is_mailto = True
        scheme = _check_url_part(scheme, _FORBIDDEN_IN_FULL_URL)

        if scheme not in _ALLOWED_SCHEMES:
            raise ValueError(
                f"Invalid scheme {scheme!r} of url {url!r} under base "
                f"{base_url!r}, only {_ALLOWED_SCHEMES!r} are "
                "permitted.")

        netloc: Final[str] = _check_url_part(
            res.netloc, _FORBIDDEN_IN_FULL_URL)

        # the parse result has no host name if the netloc names none
        host: Final[str | None] = res.hostname
        if host is None:
            raise ValueError(f"URL {url!r} has no host?")
        _check_url_part(host, _FORBIDDEN_IN_FULL_URL)
        # accessing the port may itself raise a ValueError, see the doctests
        # with the ports '-3' and '65536' in the class documentation
        port: Final[int | None] = res.port
        if port is not None:
            check_int_range(port, "port", 1, 65535)

        # an empty path is represented as `None`
        path: str | None = res.path
        if str.__len__(path) > 0:
            _check_url_part(path, _FORBIDDEN_IN_FULL_URL)
        else:
            path = None

        if is_mailto != (scheme == _MAILTO_1):  # this should be impossible
            raise ValueError(f"url {url!r} has scheme {scheme!r}?")
        requires_at: Final[bool] = is_mailto or (
            scheme in _REQUIRE_USER_NAME_SCHEMES)
        has_at: Final[bool] = "@" in netloc
        has_user: Final[bool] = (res.username is not None) and (
            str.__len__(res.username) > 0)
        # The '@' and a non-empty user name must both be present if the
        # scheme requires them and must both be absent otherwise. Checking
        # the two flags separately also rejects an '@' with an empty user
        # name, such as in 'http://@example.com', for schemes without user.
        if (has_at != requires_at) or (has_user != requires_at):
            raise ValueError(
                f"{scheme!r} url {url!r} must {'' if requires_at else 'not '}"
                f"contain '@' and have username, but got "
                f"{'@' if has_at else 'no @'} and "
                f"{repr(res.username) if has_user else 'no username'}.")

        if ((str.__len__(res.query) != 0) or (str.__len__(res.params) != 0)
                or (res.password is not None)):
            # should be impossible, as our regex check already picks this up
            raise ValueError(
                f"Query/parameters/password found in url {url!r}.")

        # an empty fragment is represented as `None`
        fragment: str | None = res.fragment
        if str.__len__(fragment) <= 0:
            fragment = None
        else:
            _check_url_part(fragment, _FORBIDDEN_IN_FRAGMENT)

        # the string value is the re-assembled parse result, checked again
        result = super().__new__(cls, _check_url_part(
            res.geturl(), _FORBIDDEN_IN_FULL_URL))

        #: the protocol scheme
        result.scheme: Final[str] = scheme  # type: ignore
        #: the network location: user@host:port
        result.netloc: Final[str] = netloc  # type: ignore
        #: the host
        result.host: Final[str] = host  # type: ignore
        #: the port, if any (else `None`)
        result.port: Final[int | None] = port  # type: ignore
        #: the path, if any (else `None`)
        result.path: Final[str | None] = path  # type: ignore
        #: the path fragment, if any (else `None`)
        result.fragment: Final[str | None] = fragment  # type: ignore
        return result