Coverage for pycommons/net/url.py: 98%

1"""Some string splitting and processing routines."""

3from re import Match, search

4from re import compile as _compile

5from typing import (

6 Any,

7 Final,

8 Pattern,

9 cast,

10)

11from urllib.parse import ParseResult, urljoin, urlparse

13# noinspection PyPackageRequirements

14from pycommons.strings.chars import WHITESPACE_OR_NEWLINE

15from pycommons.types import check_int_range

#: text that is forbidden in a (possibly relative) URL: two `'@'` characters,
#: any run of characters from `WHITESPACE_OR_NEWLINE` or of the listed
#: special characters, or two occurrences of `'://'`
_FORBIDDEN_IN_RELATIVE_URL: Final[Pattern] = _compile("|".join((
    "@.*@",
    f"[{WHITESPACE_OR_NEWLINE}" r"\\%*?&+\"'=$§!,;|<>\[\](){}²³°^]+",
    "://.*://",
)))

#: text that is forbidden in a fully-expanded URL: everything forbidden in a
#: relative URL, plus `'..'`, path components made only of dots, and URLs
#: that consist only of dots
_FORBIDDEN_IN_FULL_URL: Final[Pattern] = _compile("|".join((
    _FORBIDDEN_IN_RELATIVE_URL.pattern,
    r"\.\.",
    r"\/\.+\/",
    r"\A\.+\Z",
)))

#: text that is forbidden in a fragment: everything forbidden in a full URL,
#: plus the fragment separator `'#'` itself
_FORBIDDEN_IN_FRAGMENT: Final[Pattern] = _compile("|".join((
    _FORBIDDEN_IN_FULL_URL.pattern,
    "#",
)))

def _check_url_part(part: Any, forbidden: Pattern) -> str:
    """
    Validate one component of a URL and return it as a string.

    The checks are applied in this order: the length of `part` must be
    greater than 0 and less than 255, `part` must not contain any text
    matched by `forbidden`, it must consist of ASCII characters only, and it
    must not end in `'#'` or `'@'`.

    :param part: the URL part to validate, must be a :class:`str`
    :param forbidden: the pattern matching the forbidden text
    :returns: `part` itself, if it passed all checks
    :raises ValueError: if `part` violates one of the rules above
    :raises TypeError: if `part` is not a string or `forbidden` is neither
        a string nor a compiled pattern

    >>> try:
    ...     _check_url_part("", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part '' has invalid length 0.

    >>> try:
    ...     _check_url_part(" ", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part ' ' contains the forbidden text ' '.

    >>> try:
    ...     _check_url_part("Äquator", _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'Äquator' contains non-ASCII characters.

    >>> try:
    ...     _check_url_part("2" * 260, _FORBIDDEN_IN_RELATIVE_URL)
    ... except ValueError as ve:
    ...     print(str(ve)[:60])
    URL part '22222222222222222222222222222222222222222222222222

    >>> try:
    ...     _check_url_part(None, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     _check_url_part(2, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> isinstance(_check_url_part("123", _FORBIDDEN_IN_RELATIVE_URL), str)
    True

    >>> try:
    ...     _check_url_part(3, _FORBIDDEN_IN_RELATIVE_URL)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     _check_url_part("3", 5)
    ... except TypeError as te:
    ...     print(te)
    first argument must be string or compiled pattern
    """
    # str.__len__ doubles as the type check: it raises TypeError for non-str
    size: Final[int] = str.__len__(part)
    if (size <= 0) or (size >= 255):
        raise ValueError(f"URL part {part!r} has invalid length {len(part)}.")

    # the module-level search also rejects a `forbidden` of the wrong type
    found: Final[Match | None] = search(forbidden, part)
    if found is not None:
        raise ValueError(f"URL part {part!r} contains the forbidden "
                         f"text {found.group()!r}.")

    text: Final[str] = cast("str", part)
    if not text.isascii():
        raise ValueError(
            f"URL part {text!r} contains non-ASCII characters.")

    # text is non-empty here, so its last character always exists
    if text[-1] in ("#", "@"):
        raise ValueError(
            f"URL part must not end in {text[-1]!r}, but {text!r} does.")
    return text

104

105

#: the name of the mailto scheme
_MAILTO_1: Final[str] = "mailto"
#: the mailto scheme followed by a colon, i.e., the short mailto prefix
_MAILTO_2: Final[str] = f"{_MAILTO_1}:"
#: the mailto scheme followed by `://`, i.e., the full mailto prefix
_MAILTO_3: Final[str] = f"{_MAILTO_2}//"
#: the name of the ssh scheme
_SSH: Final[str] = "ssh"

#: the schemes whose URLs must contain a user name
_REQUIRE_USER_NAME_SCHEMES: Final[set] = {_MAILTO_1, _SSH}

#: all permitted URL schemes: those that must not contain an '@' plus those
#: that require a user name
_ALLOWED_SCHEMES: Final[set] = {"http", "https"} | _REQUIRE_USER_NAME_SCHEMES

121

122

class URL(str):  # noqa: SLOT000
    r"""
    A strictly validated, normalized, and fully expanded URL.

    The parsing routine is deliberately restrictive. It only produces URLs
    that should be safe to use in almost any environment and raises an
    exception for everything else.

    Only very few URL types and schemes are accepted. Non-ASCII characters
    are rejected, and so are spaces, `'%'`, `'*'`, `'?'`, `'+'`, `'&'`,
    `'<'`, `'>'`, `','`, `'$'`, `'§'`, `"'"`, `'"'`, `'['`, `']'`, `'{'`,
    `'}'`, `'('`, `')'`, `'\'`, and a few more.

    The character `'@'` may occur at most once. As a consequence, URLs can
    neither carry parameters nor URL-escaped non-ASCII characters, so they
    are mainly limited to pointers to static content.

    The only permitted schemes are `http`, `https`, `mailto`, and `ssh`.

    The final URL must contain neither `'/./'` nor `'/../'`, nor any
    component that equals `'..'`. Neither the URL nor any of its components
    may be empty or reach a length of 255 characters. The text `'://'` must
    not occur twice. A `mailto` or `ssh` URL must provide a username
    component, whereas `http` and `https` URLs must not contain one.

    If a port is provided, it must be in the range `1..65535`. Every URL
    must have a host.

    The URL `value` may be a relative URL, which is then turned into an
    absolute URL by resolving it against the base URL `base_url`. In this
    case, the restrictions apply to the original relative URL, to the base
    URL, and to the final absolute URL.

    Text that has no scheme but contains exactly one `'@'` is treated as an
    email address and converted to a `mailto://` URL.
    A single trailing `/` character is removed.

    An instance of `URL` is also an instance of :class:`str`, so it can be
    used as a string wherever needed. It additionally offers the following
    attributes:

    - :attr:`~URL.scheme`: the URL scheme, e.g., `"http"`
    - :attr:`~URL.netloc`: the URL network location, including user (if
      any), host, and port (if any)
    - :attr:`~URL.host`: the host of the URL
    - :attr:`~URL.port`: the port of the URL, or `None` if no port is
      specified
    - :attr:`~URL.path`: the path part of the URL (without the
      :attr:`~URL.fragment` part, if any), or `None` if no path part is
      specified
    - :attr:`~URL.fragment`: the fragment part of the path, or `None` if the
      path has no fragment


    >>> u1 = URL("mailto:tweise@hfuu.edu.cn")
    >>> print(u1)
    mailto://tweise@hfuu.edu.cn
    >>> print(u1.scheme)
    mailto
    >>> print(u1.netloc)
    tweise@hfuu.edu.cn
    >>> print(u1.host)
    hfuu.edu.cn
    >>> print(u1.port)
    None
    >>> print(u1.path)
    None
    >>> print(u1.fragment)
    None

    >>> u = URL("tweise@hfuu.edu.cn")
    >>> print(u)
    mailto://tweise@hfuu.edu.cn
    >>> print(u.scheme)
    mailto
    >>> print(u.netloc)
    tweise@hfuu.edu.cn
    >>> print(u.host)
    hfuu.edu.cn
    >>> print(u.port)
    None
    >>> print(u.path)
    None
    >>> print(u.fragment)
    None

    >>> URL("mailto://tweise@hfuu.edu.cn")
    'mailto://tweise@hfuu.edu.cn'

    >>> u2 = URL("https://example.com/abc")
    >>> print(u2)
    https://example.com/abc
    >>> print(u2.scheme)
    https
    >>> print(u2.netloc)
    example.com
    >>> print(u2.host)
    example.com
    >>> print(u2.port)
    None
    >>> print(u2.path)
    /abc
    >>> print(u2.fragment)
    None
    >>> u1.host != u2.host
    True

    >>> u = URL("https://example.com/abc/")
    >>> print(u)
    https://example.com/abc
    >>> print(u.scheme)
    https
    >>> print(u.netloc)
    example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    /abc
    >>> print(u.fragment)
    None

    >>> u = URL("https://example.com/")
    >>> print(u)
    https://example.com
    >>> print(u.scheme)
    https
    >>> print(u.netloc)
    example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    None
    >>> print(u.fragment)
    None

    >>> u = URL("ssh://git@example.com/abc")
    >>> print(u)
    ssh://git@example.com/abc
    >>> print(u.scheme)
    ssh
    >>> print(u.netloc)
    git@example.com
    >>> print(u.host)
    example.com
    >>> print(u.port)
    None
    >>> print(u.path)
    /abc
    >>> print(u.fragment)
    None

    >>> URL("1.txt", "http://example.com/thomasWeise")
    'http://example.com/1.txt'

    >>> URL("1.txt", "http://example.com/thomasWeise/")
    'http://example.com/thomasWeise/1.txt'

    >>> URL("../1.txt", "http://example.com/thomasWeise/")
    'http://example.com/1.txt'

    >>> URL("https://example.com/1.txt",
    ...     "http://github.com/thomasWeise/")
    'https://example.com/1.txt'

    >>> URL("http://example.com:123/1")
    'http://example.com:123/1'

    >>> u = URL("http://example.com:34/index.html#1")
    >>> print(u)
    http://example.com:34/index.html#1
    >>> print(u.scheme)
    http
    >>> print(u.netloc)
    example.com:34
    >>> print(u.host)
    example.com
    >>> print(u.port)
    34
    >>> print(u.path)
    /index.html
    >>> print(u.fragment)
    1

    >>> try:
    ...     URL("tweise@@hfuu.edu.cn")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'tweise@@hfuu.edu.cn' contains the forbidden text '@@'.

    >>> try:
    ...     URL("http://example.com/index.html#")
    ... except ValueError as ve:
    ...     print(ve)
    URL part must not end in '#', but 'http://example.com/index.html#' does.

    >>> try:
    ...     URL("http://example.com/index.html@")
    ... except ValueError as ve:
    ...     print(ve)
    URL part must not end in '@', but 'http://example.com/index.html@' does.

    >>> try:
    ...     URL("https://example.com/abc(/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc(/23' contains the forbidden text '('.

    >>> try:
    ...     URL("https://example.com/abc]/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc]/23' contains the forbidden text ']'.

    >>> try:
    ...     URL("https://example.com/abcä/23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abcä/23' contains non-ASCII characters.

    >>> try:
    ...     URL("https://example.com/abc/./23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc/./23' contains the forbidden text '/./'.

    >>> try:
    ...     URL("https://example.com/abc/../1.txt")
    ... except ValueError as ve:
    ...     print(str(ve)[:-4])
    URL part 'https://example.com/abc/../1.txt' contains the forbidden text '/.

    >>> try:
    ...     URL(r"https://example.com/abc\./23")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://example.com/abc\\./23' contains the forbidden text '\\'.

    >>> try:
    ...     URL("https://1.2.com/abc/23/../r")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://1.2.com/abc/23/../r' contains the forbidden text '/../'.

    >>> try:
    ...     URL("https://exa mple.com")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'https://exa mple.com' contains the forbidden text ' '.

    >>> try:
    ...     URL("ftp://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    Invalid scheme 'ftp' of url 'ftp://example.com' under base None, o

    >>> try:
    ...     URL("http://example.com%32")
    ... except ValueError as ve:
    ...     print(str(ve))
    URL part 'http://example.com%32' contains the forbidden text '%'.

    >>> try:
    ...     URL("mailto://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    'mailto' url 'mailto://example.com' must contain '@' and have user

    >>> try:
    ...     URL("ssh://example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:65])
    'ssh' url 'ssh://example.com' must contain '@' and have username,

    >>> try:
    ...     URL("ftp://example.com*32")
    ... except ValueError as ve:
    ...     print(str(ve))
    URL part 'ftp://example.com*32' contains the forbidden text '*'.

    >>> try:
    ...     URL("http://example.com/https://h")
    ... except ValueError as ve:
    ...     print(str(ve)[:74])
    URL part 'http://example.com/https://h' contains the forbidden text '://ex

    >>> try:
    ...     URL("http://user@example.com")
    ... except ValueError as ve:
    ...     print(str(ve)[:66])
    'http' url 'http://user@example.com' must not contain '@' and have

    >>> try:
    ...     URL("http://" + ("a" * 250))
    ... except ValueError as ve:
    ...     print(str(ve)[-30:])
    aaaaa' has invalid length 257.

    >>> try:
    ...     URL("http://.")
    ... except ValueError as ve:
    ...     print(ve)
    URL part '.' contains the forbidden text '.'.

    >>> try:
    ...     URL("http://..")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://..' contains the forbidden text '..'.

    >>> try:
    ...     URL("http://www.example.com/../1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://www.example.com/../1' contains the forbidden text '/../'.

    >>> try:
    ...     URL("http://www.example.com/./1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http://www.example.com/./1' contains the forbidden text '/./'.

    >>> try:
    ...     URL("http://user@example.com/@1")
    ... except ValueError as ve:
    ...     print(str(ve)[:-9])
    URL part 'http://user@example.com/@1' contains the forbidden text '@exampl

    >>> try:
    ...     URL("http://:45/1.txt")
    ... except ValueError as ve:
    ...     print(ve)
    URL 'http://:45/1.txt' has no host?

    >>> try:
    ...     URL("http://example.com:-3/@1")
    ... except ValueError as ve:
    ...     print(ve)
    Port could not be cast to integer value as '-3'

    >>> try:
    ...     URL("http://example.com:0/@1")
    ... except ValueError as ve:
    ...     print(ve)
    port=0 is invalid, must be in 1..65535.

    >>> try:
    ...     URL("http://example.com:65536/@1")
    ... except ValueError as ve:
    ...     print(ve)
    Port out of range 0-65535

    >>> try:
    ...     URL(1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     URL(None)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     URL("http::/1.txt", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     URL("http::/1.txt?x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.txt?x=1' contains the forbidden text '?'.

    >>> try:
    ...     URL("http::/1.txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.txt&x=1' contains the forbidden text '&'.

    >>> try:
    ...     URL("http::/1.+txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1.+txt&x=1' contains the forbidden text '+'.

    >>> try:
    ...     URL("http::/1*.+txt&x=1")
    ... except ValueError as ve:
    ...     print(ve)
    URL part 'http::/1*.+txt&x=1' contains the forbidden text '*'.

    >>> try:
    ...     URL("http://example.com#1#2")
    ... except ValueError as ve:
    ...     print(ve)
    URL part '1#2' contains the forbidden text '#'.
    """

    #: the scheme (protocol) of the URL, e.g., `"https"`
    scheme: Final[str]  # type: ignore
    #: the network location, usually of the form `"user@host:port"`, i.e.,
    #: the user name (if present), the host, and the port (if present)
    netloc: Final[str]  # type: ignore
    #: the host name
    host: Final[str]  # type: ignore
    #: the port number, or `None` if the URL specifies no port
    port: Final[int | None]  # type: ignore
    #: the path without the fragment component, or `None` if there is no
    #: path
    path: Final[str | None]  # type: ignore
    #: the fragment, i.e., the part following a `"#"`, or `None` if there is
    #: no fragment
    fragment: Final[str | None]  # type: ignore

    def __new__(cls, value: Any, base_url: Any | None = None):
        """
        Validate, normalize, and create the URL.

        :param value: either the full absolute URL or a URL that should be
            resolved against the URL `base_url`; if `value` already is an
            instance of :class:`URL`, it is returned as-is
        :param base_url: the base URL to resolve `value` against, or `None`
            if `value` is already an absolute URL
        :returns: the new :class:`URL` instance
        :raises ValueError: if the URL, the base URL, or one of the URL
            components violates the restrictions of this class
        :raises TypeError: if `value` or `base_url` is not a string
        """
        if isinstance(value, URL):
            return cast("URL", value)

        # A relative URL may still contain '..'; the expanded one may not.
        relative: Final[bool] = base_url is not None
        url: str = _check_url_part(
            value, _FORBIDDEN_IN_RELATIVE_URL if relative
            else _FORBIDDEN_IN_FULL_URL)
        if relative:
            base: Final[str] = _check_url_part(
                base_url, _FORBIDDEN_IN_FULL_URL)
            url = _check_url_part(urljoin(base, url), _FORBIDDEN_IN_FULL_URL)

        # gobble up one single trailing slash
        url = url.removesuffix("/")

        # expand "mailto:" to "mailto://" if the slashes are missing
        is_mailto: bool = url.startswith(_MAILTO_2)
        if is_mailto and (not url.startswith(_MAILTO_3)):
            url = _MAILTO_3 + url.removeprefix(_MAILTO_2)

        parsed: ParseResult = urlparse(url)
        scheme: str | None = parsed.scheme
        if (not scheme) and (url.count("@") == 1):
            # no scheme but exactly one '@': treat it as an email address
            parsed = urlparse(_MAILTO_3 + url)
            scheme = parsed.scheme
            is_mailto = True
        scheme = _check_url_part(scheme, _FORBIDDEN_IN_FULL_URL)

        if scheme not in _ALLOWED_SCHEMES:
            raise ValueError(
                f"Invalid scheme {scheme!r} of url {url!r} under base "
                f"{base_url!r}, only {_ALLOWED_SCHEMES!r} are "
                "permitted.")

        netloc: Final[str] = _check_url_part(
            parsed.netloc, _FORBIDDEN_IN_FULL_URL)

        hostname: Final[str | None] = parsed.hostname
        if hostname is None:
            raise ValueError(f"URL {url!r} has no host?")
        host: Final[str] = _check_url_part(hostname, _FORBIDDEN_IN_FULL_URL)

        # accessing the port may itself raise a ValueError in urllib
        port: Final[int | None] = parsed.port
        if port is not None:
            check_int_range(port, "port", 1, 65535)

        # an empty path is represented as None
        path: str | None = parsed.path or None
        if path is not None:
            _check_url_part(path, _FORBIDDEN_IN_FULL_URL)

        if is_mailto != (scheme == _MAILTO_1):  # this should be impossible
            raise ValueError(f"url {url!r} has scheme {scheme!r}?")

        # mailto and ssh need a user name, all other schemes must have none
        requires_at: Final[bool] = is_mailto or (
            scheme in _REQUIRE_USER_NAME_SCHEMES)
        has_at: Final[bool] = "@" in netloc
        has_user: Final[bool] = bool(parsed.username)
        if requires_at != (has_at and has_user):
            raise ValueError(
                f"{scheme!r} url {url!r} must {'' if requires_at else 'not '}"
                f"contain '@' and have username, but got "
                f"{'@' if has_at else 'no @'} and "
                f"{repr(parsed.username) if has_user else 'no username'}.")

        if parsed.query or parsed.params or (parsed.password is not None):
            # should be impossible, as our regex check already picks this up
            raise ValueError(
                f"Query/parameters/password found in url {url!r}.")

        # an empty fragment is represented as None
        fragment: str | None = parsed.fragment or None
        if fragment is not None:
            _check_url_part(fragment, _FORBIDDEN_IN_FRAGMENT)

        # re-assemble the URL from its parts and validate it one last time
        text: Final[str] = _check_url_part(
            parsed.geturl(), _FORBIDDEN_IN_FULL_URL)
        result = super().__new__(cls, text)

        # attach the parsed components to the new string instance
        result.scheme = scheme  # type: ignore
        result.netloc = netloc  # type: ignore
        result.host = host  # type: ignore
        result.port = port  # type: ignore
        result.path = path  # type: ignore
        result.fragment = fragment  # type: ignore
        return result

Coverage for pycommons / net / url.py: 98%

86 statements