Coverage for pycommons / net / url.py: 98%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-11 03:04 +0000

1"""Some string splitting and processing routines.""" 

2 

3from re import Match, search 

4from re import compile as _compile 

5from typing import ( 

6 Any, 

7 Final, 

8 Pattern, 

9 cast, 

10) 

11from urllib.parse import ParseResult, urljoin, urlparse 

12 

13# noinspection PyPackageRequirements 

14from pycommons.strings.chars import WHITESPACE_OR_NEWLINE 

15from pycommons.types import check_int_range 

16 

#: text that is forbidden in any URL, including relative ones. The pattern
#: has three alternatives: two `'@'` characters with no line break between
#: them, a run of one or more characters taken from `WHITESPACE_OR_NEWLINE`
#: (presumably whitespace and line break characters, confirm in
#: `pycommons.strings.chars`) or from the special characters listed in the
#: character class, and two occurrences of `'://'` with no line break
#: between them.
#: Only the first string piece is an f-string; the second one is a raw
#: string so that its backslashes and the `{}` stay literal.
_FORBIDDEN_IN_RELATIVE_URL: Final[Pattern] = _compile(
    f"@.*@|[{WHITESPACE_OR_NEWLINE}"
    r"\\%*?&+\"'=$§!,;|<>\[\](){}²³°^]+|://.*://")

#: text that is forbidden in a fully-expanded URL: everything that is
#: forbidden in a relative URL, plus `'..'` anywhere, plus any path segment
#: that consists only of dots (such as `'/./'` or `'/../'`), plus a text
#: that consists only of dots
_FORBIDDEN_IN_FULL_URL: Final[Pattern] = _compile(
    _FORBIDDEN_IN_RELATIVE_URL.pattern + r"|\.\.|\/\.+\/|\A\.+\Z")

#: text that is forbidden in a fragment: everything that is forbidden in a
#: fully-expanded URL, plus the `'#'` character
_FORBIDDEN_IN_FRAGMENT: Final[Pattern] = _compile(
    _FORBIDDEN_IN_FULL_URL.pattern + r"|#")

29 

30 

def _check_url_part(part: Any, forbidden: Pattern) -> str:
    """
    Validate one piece of a URL and hand it back as a string.

    The checks are performed in the following order and the first one that
    fails raises the error: The length of `part` must be greater than 0 and
    less than 255. `part` must not contain any text matched by `forbidden`.
    `part` must consist of ASCII characters only. `part` must not end in
    `'#'` or `'@'`.

    :param part: the text to validate
    :param forbidden: the pattern describing the forbidden text
    :returns: `part` itself, if it passed all checks
    :raises TypeError: if `part` is not a `str` or if `forbidden` cannot be
        used as regular expression
    :raises ValueError: if `part` fails any of the checks

    >>> try:
    ...     _check_url_part("", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part '' has invalid length 0.

    >>> try:
    ...     _check_url_part(" ", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part ' ' contains the forbidden text ' '.

    >>> try:
    ...     _check_url_part("Äquator", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'Äquator' contains non-ASCII characters.

    >>> try:
    ...     _check_url_part("2" * 260, _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(str(ve)[:60])
    URL part '22222222222222222222222222222222222222222222222222

    >>> try:
    ...     _check_url_part(None, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     _check_url_part(2, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> isinstance(_check_url_part("123", _FORBIDDEN_IN_RELATIVE_URL), str)
    True

    >>> try:
    ...     _check_url_part(3, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     _check_url_part("3", 5)
    ... except TypeError as te:
    ...     print(te)
    first argument must be string or compiled pattern
    """
    # `str.__len__` doubles as type check: it raises a TypeError for
    # anything that is not a string.
    length: Final[int] = str.__len__(part)
    if (length <= 0) or (length >= 255):
        raise ValueError(f"URL part {part!r} has invalid length {len(part)}.")

    found: Final[Match | None] = search(forbidden, part)
    if found is not None:
        raise ValueError(f"URL part {part!r} contains the forbidden "
                         f"text {found.group()!r}.")

    text: Final[str] = cast("str", part)
    if not text.isascii():
        raise ValueError(
            f"URL part {text!r} contains non-ASCII characters.")

    # The text is known to be non-empty here, so the last character exists.
    last: Final[str] = text[-1]
    if last in ("#", "@"):
        raise ValueError(
            f"URL part must not end in {last!r}, but {text!r} does.")
    return text

104 

105 

#: the scheme name of e-mail URLs
_MAILTO_1: Final[str] = "mailto"
#: the mailto scheme followed by the colon separator
_MAILTO_2: Final[str] = f"{_MAILTO_1}:"
#: the normalized mailto prefix, i.e., scheme, colon, and double slash
_MAILTO_3: Final[str] = f"{_MAILTO_2}//"
#: the scheme name of ssh URLs
_SSH: Final[str] = "ssh"

#: the schemes whose URLs must contain a user name
_REQUIRE_USER_NAME_SCHEMES: Final[set] = {_MAILTO_1, _SSH}

#: all permitted URL schemes: the schemes that must not have a user name
#: together with the schemes that require one
_ALLOWED_SCHEMES: Final[set] = {"http", "https"} | _REQUIRE_USER_NAME_SCHEMES

121 

122 

class URL(str):  # noqa: SLOT000
    r"""
    A normalized and expanded URL.

    This is a very strict URL parsing routine. The idea is that it will only
    produce URLs that are safe for use in almost any environment and throw
    exceptions otherwise.

    We limit the URLs to very few different types and allowed schemes.
    Non-ASCII characters are not allowed, and neither are spaces, `'%'`,
    `'*'`, `'?'`, `'+'`, `'&'`, `'<'`, `'>'`, `','`, `'$'`, `'§'`, `"'"`,
    `'"'`, `'['`, `']'`, `'{'`, `'}'`, `'('`, `')'`, nor `'\'` and a few
    more.

    We also allow `'@'` to occur at most once. This means that URLs cannot
    have any parameters and also that URL-escaping non-ASCII characters is not
    possible either. We thus limit the URLs to mainly static content pointers.

    We also only permit simple schemes such as `http`, `https`, `mailto`, and
    `ssh`.

    The final URL also cannot contain any `'/./'` or `'/../'` or consist of
    any component that equals `'..'`. No URL or component may have 255 or
    more characters either. It is also not allowed that `'://'` occurs twice.
    If the URL is a `mailto` or `ssh` URL, it must provide a username
    component.

    If a port is provided, it must be greater than 0 and less than 65536.
    If a port is specified, a host must be specified as well.
    Only if a netloc is found, then a port or a host may be specified.

    The URL `value` may be a relative URL that is turned into an absolute URL
    using the base URL `base_url`. Of course, then the same restrictions apply
    to the relative original URL, the base URL, and the final absolute URL.

    This function tries to detect email addresses and turns them into valid
    `mailto://` urls.
    This function gobbles up single trailing `/` characters.

    An instance of `URL` is also an instance of :class:`str`, so you can use
    it as string wherever you want. It additionally offers the following
    attributes:

    - :attr:`~URL.scheme`: the URL scheme, e.g., `"http"`
    - :attr:`~URL.netloc`: the URL network location, including user (if any),
      host, and port (if any)
    - :attr:`~URL.host`: the host of the URL
    - :attr:`~URL.port`: the port of the URL, or `None` if no port is
      specified
    - :attr:`~URL.path`: the path part of the URL (without the
      :attr:`~URL.fragment` part, if any), or `None` if no path part is
      specified
    - :attr:`~URL.fragment`: the fragment part of the path, or `None` if the
      path has no fragment


    >>> u1 = URL("mailto:tweise@hfuu.edu.cn")
    >>> print(u1)
    mailto://tweise@hfuu.edu.cn
    >>> print(u1.scheme)
    mailto
    >>> print(u1.netloc)
    tweise@hfuu.edu.cn
    >>> print(u1.host)
    hfuu.edu.cn
    >>> print(u1.port)
    None
    >>> print(u1.path)
    None
    >>> print(u1.fragment)
    None

    >>> u = URL("tweise@hfuu.edu.cn")
    >>> print(u)
    mailto://tweise@hfuu.edu.cn
    >>> print(u.scheme)
    mailto
    >>> print(u.netloc)
    tweise@hfuu.edu.cn
    >>> print(u.host)
    hfuu.edu.cn
    >>> print(u.port)
    None
    >>> print(u.path)
    None
    >>> print(u.fragment)
    None

    >>> URL("mailto://tweise@hfuu.edu.cn")
    'mailto://tweise@hfuu.edu.cn'

    >>> u2 = URL("https://example.com/abc")
    >>> print(u2)
    https://example.com/abc
    >>> print(u2.scheme)
    https
    >>> print(u2.netloc)
    example.com
    >>> print(u2.host)
    example.com
    >>> print(u2.port)
    None
    >>> print(u2.path)
    /abc
    >>> print(u2.fragment)
    None
    >>> u1.host != u2.host
    True

    >>> u = URL("https://example.com/abc/")
    >>> print(u)
    https://example.com/abc
    >>> print(u.scheme)
    https
    >>> print(u.netloc)
    example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    /abc
    >>> print(u.fragment)
    None

    >>> u = URL("https://example.com/")
    >>> print(u)
    https://example.com
    >>> print(u.scheme)
    https
    >>> print(u.netloc)
    example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    None
    >>> print(u.fragment)
    None

    >>> u = URL("ssh://git@example.com/abc")
    >>> print(u)
    ssh://git@example.com/abc
    >>> print(u.scheme)
    ssh
    >>> print(u.netloc)
    git@example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    /abc
    >>> print(u.fragment)
    None

    >>> URL("1.txt", "http://example.com/thomasWeise")
    'http://example.com/1.txt'

    >>> URL("1.txt", "http://example.com/thomasWeise/")
    'http://example.com/thomasWeise/1.txt'

    >>> URL("../1.txt", "http://example.com/thomasWeise/")
    'http://example.com/1.txt'

    >>> URL("https://example.com/1.txt",
    ...     "http://github.com/thomasWeise/")
    'https://example.com/1.txt'

    >>> URL("http://example.com:123/1")
    'http://example.com:123/1'

    >>> u = URL("http://example.com:34/index.html#1")
    >>> print(u)
    http://example.com:34/index.html#1
    >>> print(u.scheme)
    http
    >>> print(u.netloc)
    example.com:34
    >>> print(u.host)
    example.com
    >>> print(u.port)
    34
    >>> print(u.path)
    /index.html
    >>> print(u.fragment)
    1

    >>> try:
    ...     URL("tweise@@hfuu.edu.cn")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'tweise@@hfuu.edu.cn' contains the forbidden text '@@'.

    >>> try:
    ...     URL("http://example.com/index.html#")
    ... except ValueError as ve:
    ...     print(ve)
    URL part must not end in '#', but 'http://example.com/index.html#' does.

    >>> try:
    ...     URL("http://example.com/index.html@")
    ... except ValueError as ve:
    ...     print(ve)
    URL part must not end in '@', but 'http://example.com/index.html@' does.

    >>> try:
    ...     URL("https://example.com/abc(/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc(/23' contains the forbidden text '('.

    >>> try:
    ...     URL("https://example.com/abc]/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc]/23' contains the forbidden text ']'.

    >>> try:
    ...     URL("https://example.com/abcä/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abcä/23' contains non-ASCII characters.

    >>> try:
    ...     URL("https://example.com/abc/./23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc/./23' contains the forbidden text '/./'.

    >>> try:
    ...     URL("https://example.com/abc/../1.txt")
    ... except ValueError as ve:
    ...     print(str(ve)[:-4])
    URL part 'https://example.com/abc/../1.txt' contains the forbidden text '/.

    >>> try:
    ...     URL(r"https://example.com/abc\./23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc\\./23' contains the forbidden text '\\'.

    >>> try:
    ...     URL("https://1.2.com/abc/23/../r")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://1.2.com/abc/23/../r' contains the forbidden text '/../'.

    >>> try:
    ...     URL("https://exa mple.com")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://exa mple.com' contains the forbidden text ' '.

    >>> try:
    ...     URL("ftp://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    Invalid scheme 'ftp' of url 'ftp://example.com' under base None, o

    >>> try:
    ...     URL("http://example.com%32")
    ... except ValueError as ve:
    ...     print(str(ve))
    URL part 'http://example.com%32' contains the forbidden text '%'.

    >>> try:
    ...     URL("mailto://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    'mailto' url 'mailto://example.com' must contain '@' and have user

    >>> try:
    ...     URL("ssh://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:65])
    'ssh' url 'ssh://example.com' must contain '@' and have username,

    >>> try:
    ...     URL("ftp://example.com*32")
    ... except ValueError as ve:
    ...     print(str(ve))
    URL part 'ftp://example.com*32' contains the forbidden text '*'.

    >>> try:
    ...     URL("http://example.com/https://h")
    ... except ValueError as ve:
    ...     print(str(ve)[:74])
    URL part 'http://example.com/https://h' contains the forbidden text '://ex

    >>> try:
    ...     URL("http://user@example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    'http' url 'http://user@example.com' must not contain '@' and have

    >>> try:
    ...     URL("http://" + ("a" * 250))
    ... except ValueError as ve:
    ...     print(str(ve)[-30:])
    aaaaa' has invalid length 257.

    >>> try:
    ...     URL("http://.")
    ... except ValueError as ve:
    ...     print(ve)
    URL part '.' contains the forbidden text '.'.

    >>> try:
    ...     URL("http://..")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://..' contains the forbidden text '..'.

    >>> try:
    ...     URL("http://www.example.com/../1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://www.example.com/../1' contains the forbidden text '/../'.

    >>> try:
    ...     URL("http://www.example.com/./1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://www.example.com/./1' contains the forbidden text '/./'.

    >>> try:
    ...     URL("http://user@example.com/@1")
    ... except ValueError as ve:
    ...     print(str(ve)[:-9])
    URL part 'http://user@example.com/@1' contains the forbidden text '@exampl

    >>> try:
    ...     URL("http://:45/1.txt")
    ... except ValueError as ve:
    ...     print(ve)
    URL 'http://:45/1.txt' has no host?

    >>> try:
    ...     URL("http://example.com:-3/@1")
    ... except ValueError as ve:
    ...     print(ve)
    Port could not be cast to integer value as '-3'

    >>> try:
    ...     URL("http://example.com:0/@1")
    ... except ValueError as ve:
    ...     print(ve)
    port=0 is invalid, must be in 1..65535.

    >>> try:
    ...     URL("http://example.com:65536/@1")
    ... except ValueError as ve:
    ...     print(ve)
    Port out of range 0-65535

    >>> try:
    ...     URL(1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     URL(None)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     URL("http::/1.txt", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     URL("http::/1.txt?x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.txt?x=1' contains the forbidden text '?'.

    >>> try:
    ...     URL("http::/1.txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.txt&x=1' contains the forbidden text '&'.

    >>> try:
    ...     URL("http::/1.+txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.+txt&x=1' contains the forbidden text '+'.

    >>> try:
    ...     URL("http::/1*.+txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1*.+txt&x=1' contains the forbidden text '*'.

    >>> try:
    ...     URL("http://example.com#1#2")
    ... except ValueError as ve:
    ...     print(ve)
    URL part '1#2' contains the forbidden text '#'.
    """

    #: the protocol scheme, e.g., `"https"`
    scheme: Final[str]  # type: ignore
    #: the network location, usually of the form `"user@host:port"`, i.e.,
    #: composed of user name (if present), host, and port (if present)
    netloc: Final[str]  # type: ignore
    #: the host str
    host: Final[str]  # type: ignore
    #: the port, if any (else `None`)
    port: Final[int | None]  # type: ignore
    #: the path, if any (else `None`), but without the fragment component
    path: Final[str | None]  # type: ignore
    #: the path fragment, i.e., the part following a `"#"`, if any (else
    #: `None`)
    fragment: Final[str | None]  # type: ignore

    def __new__(cls, value: Any, base_url: Any | None = None):
        """
        Create the URL.

        If `value` already is an instance of :class:`URL`, it is returned
        as-is and `base_url` is not looked at. Otherwise, `value` is
        validated, resolved against `base_url` (if given), normalized,
        parsed, and each of its components is validated again.

        :param value: either the full absolute URL or a URL that should be
            resolved against the URL `base_url`
        :param base_url: the base URL to resolve `value` against, or `None` if
            `value` is already an absolute URL
        :returns: the new URL instance, or `value` itself if it already is
            a :class:`URL`
        :raises TypeError: if `value` or `base_url` (when given) is not a
            string
        :raises ValueError: if the URL or any of its components violates one
            of the restrictions described in the class documentation
        """
        # Instances of URL have already been validated when they were made.
        if isinstance(value, URL):
            return cast("URL", value)

        # If a base URL is given, `value` only needs to satisfy the weaker
        # rules for relative URLs (which, e.g., permit '..'); the joined
        # result is then checked against the rules for full URLs.
        url: str = _check_url_part(
            value, _FORBIDDEN_IN_FULL_URL if base_url is None
            else _FORBIDDEN_IN_RELATIVE_URL)
        if base_url is not None:
            url = _check_url_part(urljoin(_check_url_part(
                base_url, _FORBIDDEN_IN_FULL_URL), url),
                _FORBIDDEN_IN_FULL_URL)

        # drop a single trailing slash
        url = url.removesuffix("/")

        # normalize mailto URLs that do not contain //
        is_mailto: bool = url.startswith(_MAILTO_2)
        if is_mailto and (not url.startswith(_MAILTO_3)):
            url = _MAILTO_3 + url[str.__len__(_MAILTO_2):]

        res: ParseResult = urlparse(url)
        scheme: str | None = res.scheme
        # A text without scheme but with exactly one '@' is treated as email
        # address and parsed again with the mailto prefix. Only `res` is
        # replaced: `url` keeps the text without the prefix and is used only
        # in error messages from here on; the final string is built from
        # `res`.
        if ((scheme is None) or (str.__len__(scheme) == 0)) and (
                url.count("@") == 1):
            res = urlparse(_MAILTO_3 + url)
            scheme = res.scheme
            is_mailto = True
        scheme = _check_url_part(scheme, _FORBIDDEN_IN_FULL_URL)

        if scheme not in _ALLOWED_SCHEMES:
            raise ValueError(
                f"Invalid scheme {scheme!r} of url {url!r} under base "
                f"{base_url!r}, only {_ALLOWED_SCHEMES!r} are "
                "permitted.")

        # the network location must be present and valid
        netloc: Final[str] = _check_url_part(
            res.netloc, _FORBIDDEN_IN_FULL_URL)

        # the host must be present and valid
        host: Final[str] = res.hostname
        if host is None:
            raise ValueError(f"URL {url!r} has no host?")
        _check_url_part(host, _FORBIDDEN_IN_FULL_URL)
        # Accessing `res.port` may itself raise a ValueError for ports that
        # are not integers or are out of range (see the examples in the
        # class documentation). The range check below additionally rejects
        # port 0.
        port: Final[int | None] = res.port
        if port is not None:
            check_int_range(port, "port", 1, 65535)

        # an empty path is stored as `None`, a non-empty path is validated
        path: str | None = res.path
        if str.__len__(path) > 0:
            _check_url_part(path, _FORBIDDEN_IN_FULL_URL)
        else:
            path = None

        if is_mailto != (scheme == _MAILTO_1):  # this should be impossible
            raise ValueError(f"url {url!r} has scheme {scheme!r}?")
        # Schemes that require a user name must have both an '@' and a
        # non-empty user name; all other schemes must not have both.
        requires_at: Final[bool] = is_mailto or (
            scheme in _REQUIRE_USER_NAME_SCHEMES)
        has_at: Final[bool] = "@" in netloc
        has_user: Final[bool] = (res.username is not None) and (
            str.__len__(res.username) > 0)
        if requires_at != (has_at and has_user):
            raise ValueError(
                f"{scheme!r} url {url!r} must {'' if requires_at else 'not '}"
                f"contain '@' and have username, but got "
                f"{'@' if has_at else 'no @'} and "
                f"{repr(res.username) if has_user else 'no username'}.")

        if ((str.__len__(res.query) != 0) or (str.__len__(res.params) != 0)
                or (res.password is not None)):
            # should be impossible, as our regex check already picks this up
            raise ValueError(
                f"Query/parameters/password found in url {url!r}.")

        # an empty fragment is stored as `None`, a non-empty fragment is
        # validated and must not contain another '#'
        fragment: str | None = res.fragment
        if str.__len__(fragment) <= 0:
            fragment = None
        else:
            _check_url_part(fragment, _FORBIDDEN_IN_FRAGMENT)

        # The string value is the URL re-assembled from the parsed result,
        # validated one final time.
        result = super().__new__(cls, _check_url_part(
            res.geturl(), _FORBIDDEN_IN_FULL_URL))

        #: the protocol scheme
        result.scheme: Final[str] = scheme  # type: ignore
        #: the network location: user@host:port
        result.netloc: Final[str] = netloc  # type: ignore
        #: the host
        result.host: Final[str] = host  # type: ignore
        #: the port, if any (else `None`)
        result.port: Final[int | None] = port  # type: ignore
        #: the path, if any (else `None`)
        result.path: Final[str | None] = path  # type: ignore
        #: the path fragment, if any (else `None`)
        result.fragment: Final[str | None] = fragment  # type: ignore
        return result
644 return result