Coverage for pycommons/dev/tests/links_in_md.py: 94% (309 statements)
coverage.py v7.13.2, created at 2026-02-02 06:36 +0000
1"""Test all the links in."""
2from os import environ
3from random import randint
4from time import sleep
5from typing import Final, cast
7# noinspection PyPackageRequirements
8from certifi import where
10# noinspection PyPackageRequirements
11from urllib3 import PoolManager # type: ignore
13# noinspection PyPackageRequirements
14from urllib3.response import HTTPResponse # type: ignore
16from pycommons.io.console import logger
17from pycommons.io.path import UTF8, Path, file_path
18from pycommons.net.url import URL
19from pycommons.strings.string_tools import replace_str
20from pycommons.types import check_int_range, type_error
#: The hosts that are sometimes unreachable from my local machine.
#: When the test is executed in a GitHub workflow, all hosts should be
#: reachable, except sometimes our institute's website and fsf.org.
__SOMETIMES_UNREACHABLE_HOSTS: Final[set[str]] = \
    {"fsf.org"} if "GITHUB_JOB" in environ else \
    {"fsf.org", "img.shields.io", "pypi.org", "docs.python.org"}

#: URLs that we never need to check because they are known to be OK.
__CORRECT_URLS: Final[set[str]] = {
    "https://example.com", "http://example.com",
    "https://github.com", "http://github.com",
    ("https://www.acm.org/publications/policies/artifact-review"
     "-and-badging-current")}


def __ve(msg: str, text: str, idx: int) -> ValueError:
    """
    Create a value error for the given text piece.

    :param msg: the message
    :param text: the string
    :param idx: the index into `text`
    :returns: a :class:`ValueError` ready to be raised
    :raises TypeError: if any argument is of the wrong type

    >>> try:
    ...     __ve(None, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve(1, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", None, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve("bla", 1, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", "txt", None)
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is None.

    >>> try:
    ...     __ve("bla", "txt", "x")
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is str, namely 'x'.

    >>> print(repr(__ve("", "txt", 1)))
    ValueError('Empty message!')

    >>> print(repr(__ve("msg", "", 1)))
    ValueError("Empty text '' for message 'msg'.")

    >>> print(repr(__ve("msg", "txt", 5)))
    ValueError("Index 5 is outside of text of length 3 for message 'msg'.")

    >>> print(repr(__ve("msg", "long text", 2)))
    ValueError("msg: '...long text...'")
    """
    if str.__len__(msg) == 0:
        return ValueError("Empty message!")
    len_text: Final[int] = str.__len__(text)
    if len_text <= 0:
        return ValueError(f"Empty text {text!r} for message {msg!r}.")
    if not isinstance(idx, int):
        raise type_error(idx, "idx", int)
    if len_text <= idx:
        return ValueError(f"Index {idx} is outside of text of length"
                          f" {len_text} for message {msg!r}.")
    # Show a window of the text around the offending index for context.
    piece = text[max(0, idx - 32):min(len_text, idx + 64)].strip()
    return ValueError(f"{msg}: '...{piece}...'")


def __make_headers() -> tuple[dict[str, str] | None, ...]:
    """
    Make the headers.

    :returns: the headers
    """
    headers: list[dict[str, str] | None] = [None]
    headers.extend(
        {"User-Agent": ua} for ua in (
            ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) "
             "Gecko/20100101 Firefox/138.0"),
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
             "KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
             " Edg/136.0.0.0"),
            ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like"
             " Gecko) Chrome/136.0.0.0 Safari/537.36"),
            ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) "
             "Gecko/20100101 Firefox/106.0"),
            ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like"
             " Gecko) Chrome/109.0.0.0 Safari/537.36"),
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
             "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0."
             "1518.55"),
            ("Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 "
             "Version/12.16.2"),
            ("Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) "
             "like Gecko"),
            ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/"
             "537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A"),
            "Mozilla/5.0 (PLAYSTATION 3; 3.55)",
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
             "KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/114.0."
             "1823.901"),
            ("mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 ("
             "khtml, like gecko) chrome/80.0.3987.87 safari/537.36 edg/80.0."
             "361.502"),
            "Mozilla/5.0 (X11; Linux i686; rv:13.0) Gecko/13.0 Firefox/13.0",
            ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML"
             ", like Gecko) Ubuntu Chromium/80.0.3987.149 HeadlessChrome/"
             "80.0.3987.149 Safari/537.36")))
    return tuple(headers)


#: The headers to use for the HTTP requests.
#: It seems that some websites may throttle requests.
#: Maybe by using different headers, we can escape this.
__HEADERS: Final[tuple[dict[str, str] | None, ...]] = __make_headers()
del __make_headers


def __needs_body(url: URL) -> bool:
    """
    Check whether we need the body of the given URL.

    If the complete body of the document needs to be downloaded, this
    function returns `True`. This is the case, for example, for HTML
    documents: there, we need to (later) scan for internal references,
    i.e., for attributes like `id="..."`. However, if the URL does not
    point to an HTML document, maybe to a PDF, then we do not need the
    whole body and return `False`. In the latter case, it is sufficient to
    do a `HEAD` HTTP request; in the former case, we need a full `GET`.

    :param url: the URL
    :returns: `True` if the body is needed, `False` otherwise
    :raises TypeError: if `url` is not an instance of :class:`URL`

    >>> __needs_body(URL("http://www.github.com/"))
    True
    >>> __needs_body(URL("http://www.github.com"))
    True
    >>> __needs_body(URL("http://www.github.com/1.htm"))
    True
    >>> __needs_body(URL("http://www.github.com/1.html"))
    True
    >>> __needs_body(URL("http://www.github.com/1.jpg"))
    False
    >>> __needs_body(URL("http://www.github.com/1"))
    True
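
    A PDF link, for instance, needs no body either (an added example,
    consistent with the suffix rule implemented below):

    >>> __needs_body(URL("http://www.github.com/doc.pdf"))
    False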

    >>> try:
    ...     __needs_body(None)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is N

    >>> try:
    ...     __needs_body(1)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is i
    """
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    return (url.path is None) or str.endswith(
        url.path, (".html", ".htm", "/")) or ("." not in url.path)


def __find_fragment_html(body: str, fragment: str, url: URL) -> None:
    r"""
    Check whether the fragment is contained in the body as an ID.

    :param body: the body that was loaded
    :param fragment: the fragment
    :param url: the URL from which the body was loaded
    :raises TypeError: if `body` or `fragment` is not a string, or if
        `url` is not an instance of :class:`URL`
    :raises ValueError: if `body` does not contain `fragment` as an ID
        somewhere

    >>> __find_fragment_html("<p id='1'>bla</p>", "1",
    ...                      URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=\"1\">bla</p>", "1",
    ...                      URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=1>bla</p>", "1",
    ...                      URL("http://example.com#1"))
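
    An id generated from a heading works the same way (an added
    illustrative example):

    >>> __find_fragment_html('<h2 id="usage">Usage</h2>', "usage",
    ...                      URL("http://example.com#usage"))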

    >>> try:
    ...     __find_fragment_html(None, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html(1, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", None,
    ...                          URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", 1,
    ...                          URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", None)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is None.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", 1)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is int, namely 1.

    >>> try:
    ...     __find_fragment_html("", "1",
    ...                          URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty body: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "",
    ...                          URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty fragment: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1",
    ...                          URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Url 'http://example.com' does not end in fragment '1'.

    >>> try:
    ...     __find_fragment_html("<p id='x1'>bla</p>", "1",
    ...                          URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(str(ve)[:-4])
    Did not find id='1' of 'http://example.com#1' in body "<p id='x1'>bla</
    """
    if str.__len__(body) <= 0:
        raise ValueError(f"Empty body: {body!r}.")
    if str.__len__(fragment) <= 0:
        raise ValueError(f"Empty fragment: {fragment!r}.")
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    if not url.endswith(fragment):
        raise ValueError(
            f"Url {url!r} does not end in fragment {fragment!r}.")

    # Accept the unquoted, single-quoted, and double-quoted id notations.
    for qt in ("", "'", '"'):
        if f"id={qt}{fragment}{qt}" in body:
            return

    raise ValueError(
        f"Did not find id={fragment!r} of {url!r} in body {body!r}.")


def __check_url(urlstr: str, valid_urls: dict[str, str | None],
                http: PoolManager = PoolManager(
                    cert_reqs="CERT_REQUIRED", ca_certs=where())) -> None:
    r"""
    Check whether a URL is valid and can be reached.

    :param urlstr: the URL to be checked
    :param valid_urls: the dictionary of known-valid URLs, mapping each
        URL to its body (or `None`)
    :param http: the pool manager
    :raises TypeError: if any of the parameters is of the wrong type
    :raises ValueError: if the URL `urlstr` cannot be loaded or if it has
        a fragment part that is not discovered in the body of the loaded
        document

    >>> vu = dict()
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("tweise@hfuu.edu.cn", vu)

    >>> __check_url("https://thomasweise.github.io/pycommons/#introduction",
    ...             {})

    >>> from contextlib import redirect_stdout
    >>> with redirect_stdout(None):  # check __SOMETIMES_UNREACHABLE_HOSTS
    ...     __check_url("https://fsf.org/111111111111111", vu)

    >>> try:
    ...     with redirect_stdout(None):
    ...         __check_url("https://github.io.github.io/111111111", vu)
    ... except ValueError as ve:
    ...     print(str(ve))
    Could not load url 'https://github.io.github.io/111111111'.

    >>> with redirect_stdout(None):
    ...     __check_url("https://thomasweise.github.io/pycommons", vu)
    ...     __check_url("http://example.com/", vu)
    ...     __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...                 ".io.html", vu)
    >>> __check_url("https://thomasweise.github.io/pycommons", vu)
    >>> __check_url(
    ...     "https://thomasweise.github.io/pycommons/pycommons.io.html", vu)

    >>> __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...             ".io.html#pycommons.io.path.Path", vu)
    >>> __check_url("http://example.com", vu)

    >>> try:
    ...     __check_url("bwri435//sdfsdf:-@@", vu)
    ... except ValueError as ve:
    ...     print(str(ve)[:50])
    Error in url 'bwri435//sdfsdf:-@@': URL part 'bwri

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruS.jpg#34", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s[:61])
    Url 'https://thomasweise.github.io/sifwrwruS.jpg#34' does not

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url("ssh://u@thomasweise.github.io/sifwrwruSSXFd", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s)
    Invalid scheme for url 'ssh://u@thomasweise.github.io/sifwrwruSSXFd'.

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruSSXFdfDX", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> s.endswith("returns code 404.") or s.startswith("Could not load url")
    True

    >>> try:
    ...     __check_url(None, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __check_url(1, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __check_url("http://example.com", None)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is None.

    >>> try:
    ...     __check_url("http://example.com", 1)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is int, namely 1.

    >>> try:
    ...     __check_url("http://example.com", dict(), None)
    ... except TypeError as te:
    ...     print(te)
    http should be an instance of urllib3.poolmanager.PoolManager but is None.

    >>> try:
    ...     __check_url("http://example.com", dict(), 1)
    ... except TypeError as te:
    ...     print(str(te)[:50])
    http should be an instance of urllib3.poolmanager.
    """
    if not isinstance(valid_urls, dict):
        raise type_error(valid_urls, "valid_urls", dict)
    if not isinstance(http, PoolManager):
        raise type_error(http, "http", PoolManager)

    if urlstr in valid_urls:
        return

    try:
        url: Final[URL] = URL(urlstr)
    except ValueError as ve:
        raise ValueError(f"Error in url {urlstr!r}: {ve}") from None

    if (url in __CORRECT_URLS) or (url in valid_urls):
        return
    if url.scheme == "mailto":
        return
    if not url.scheme.startswith("http"):
        raise ValueError(f"Invalid scheme for url {url!r}.")

    needs_body: Final[bool] = __needs_body(url)

    base_url: URL = url
    fragment: Final[str | None] = url.fragment
    if fragment is not None:
        base_url = URL(url[:url.index("#")])
        if not needs_body:
            raise ValueError(
                f"Url {url!r} does not need body but has "
                f"fragment {url.fragment!r}?")
        if base_url in valid_urls:
            __find_fragment_html(valid_urls[base_url], fragment, url)
            return

    code: int
    body: str | None = None
    method = "GET" if needs_body else "HEAD"
    error: BaseException | None = None
    response: HTTPResponse | None = None
    headers: Final[list[dict[str, str] | None]] = list(__HEADERS)
    header_count: int = 0

    # Sometimes, access to the URLs on GitHub fails.
    # I think they probably throttle access from here.
    # Therefore, we first do a request with a 5s timeout and 0 retries.
    # If that fails, we wait 2 seconds and try with timeout 8 and 3
    # retries. If that fails, we wait 5s, then try with timeout 30 and 3
    # retries. Only if that fails too do we assume that the URL is really
    # incorrect, which should only rarely be a wrong conclusion
    # (justifying the many retries).
    for sleep_time, retries, timeout in (
            (0, 0, 5), (2, 3, 8), (5, 3, 30)):
        if sleep_time > 0:
            sleep(sleep_time)

        # We pick a random header to deal with the problem that some
        # pages will not permit certain user agents. To handle this
        # issue, we try not to use any user agent twice: we randomly pick
        # a user agent and, if it fails, make sure to use all other user
        # agents first before we use that one again.
        if header_count <= 0:
            header_count = len(headers)
        header_idx = randint(0, header_count - 1)  # noqa: S311
        header: dict[str, str] | None = headers[header_idx]
        header_count -= 1
        headers[header_count], headers[header_idx] \
            = header, headers[header_count]
        try:
            response = cast("HTTPResponse", http.request(
                method, base_url, timeout=timeout, redirect=True,
                retries=retries, headers=header))
            if isinstance(response, HTTPResponse) and isinstance(
                    response.status, int) and (response.status == 200):
                error = None
                break
        except BaseException as be:  # noqa
            logger(f"Attempt sleep={sleep_time}, retries={retries}, "
                   f"timeout={timeout}, error={str(be)!r}, and "
                   f"header={header!r} for {base_url!r} gave {be}.")
            error = be

    if error is not None:
        # sometimes, I cannot reach some hosts from here...
        if url.host in __SOMETIMES_UNREACHABLE_HOSTS:
            return  # we will accept this here
        raise ValueError(f"Could not load url {url!r}.") from error

    if not isinstance(response, HTTPResponse):  # should be impossible...
        raise ValueError(f"Response {response} from url={url!r}?")  # noqa

    code = check_int_range(response.status, "response.status", 0, 10000)
    if needs_body:
        try:
            body = str.strip(response.data.decode(UTF8))
        except BaseException as be:  # noqa
            raise ValueError(f"Error in body of url {url!r}: {be}") from be

    body_len: Final[int] = 0 if body is None else str.__len__(body)
    logger(f"Checked url {url!r} got code {code} for method {method!r} and "
           f"{body_len} chars.")
    if code != 200:
        raise ValueError(f"Url {url!r} returns code {code}.")

    if needs_body and ((body is None) or (body_len <= 0)):
        raise ValueError(
            f"Stripped body for {url!r} / {base_url!r} is {body!r}?")

    valid_urls[base_url] = body
    if url is not base_url:
        valid_urls[url] = body

    if fragment is not None:
        __find_fragment_html(body, fragment, url)


def check_links_in_md(file: str) -> None:
    """
    Test all the links in the given file.

    :param file: the file to check

    >>> from pycommons.io.temp import temp_file
    >>> with temp_file() as tf:
    ...     tf.write_all_str("[test](https://example.com)")
    ...     check_links_in_md(tf)
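
    A local `#fragment` link is resolved against the ids generated from
    the headings of the file itself (an added example; `example.com` is a
    known-correct URL, so no network access is needed):

    >>> with temp_file() as tf:
    ...     tf.write_all_str("# 1. Intro" + chr(10) + "[x](#1-intro)"
    ...                      + chr(10) + "[test](https://example.com)")
    ...     check_links_in_md(tf)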

    >>> with temp_file() as tf:
    ...     try:
    ...         check_links_in_md(tf)
    ...     except ValueError as ve:
    ...         print(str(ve)[-19:])
    ' contains no text.
    """
    # First, we load the file as a single string.
    readme: Final[Path] = file_path(file)
    logger(f"Checking all links in the file {readme!r}.")

    text: str = readme.read_all_str()
    text_len: int = str.__len__(text)
    logger(f"Got {text_len} characters from file {readme!r}.")
    if text_len <= 0:
        raise ValueError(f"{readme!r} file is empty?")

    # Remove all fenced code blocks: links inside code need no checking.
    total_links_checked: int = 0
    start: int = -1
    lines: Final[list[str]] = []
    while True:
        start += 1
        i: int = text.find("\n```", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j: int = text.find("\n```", i + 1)
        if j < i:
            raise __ve("Multi-line code start without "
                       f"end in file {readme!r}", text, i)
        k: int = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Code end without newline in file {readme!r}",
                       text, i)
        lines.append(text[start:i].strip())
        start = k

    text = "\n".join(lines).strip()
    lines.clear()

    # These are all the URLs that have been verified.
    valid_urls: Final[dict[str, str | None]] = {}

    # Build the map of local reference marks: every headline yields a
    # GitHub-style anchor id, e.g., "# 1. Intro" yields "#1-intro".
    start = -1
    while True:
        start += 1
        i = 0 if ((start == 0) and text.startswith("#")) \
            else text.find("\n#", start)
        if i < start:
            break
        j = text.find(" ", i + 1)
        if (j < i) or (text[j - 1] != "#"):
            raise __ve("Headline without space after # "
                       f"in file {readme!r}", text, i)
        k = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Headline without end in file {readme!r}", text, i)
        rid: str = text[j:k].strip().replace(" ", "-")
        for ch in ".:,()`/":
            rid = rid.replace(ch, "")
        rid = replace_str("--", "-", rid).lower()
        # Only the very first headline may lack a leading section number.
        if (str.__len__(rid) <= 2) or ((rid[0] not in "123456789") and (
                start > 0)) or ("-" not in rid):
            raise __ve(f"Invalid id {rid!r} in file {readme!r}", text, i)
        valid_urls[f"#{rid}"] = None
        start = k

    # Remove all inline code.
    start = -1
    while True:
        start += 1
        i = text.find("`", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j = text.find("`", i + 1)
        if j < i:
            raise __ve("Inline code start "
                       f"without end in file {readme!r}", text, i)
        lines.append(text[start:i].strip())
        start = j
    text = "\n".join(lines).strip()
    lines.clear()
646 logger(f"Now checking '![...]()' style urls in file {readme!r}.")
648 # now gather the links to images and remove them
649 start = -1
650 lines.clear()
651 while True:
652 start += 1
653 i = text.find("![", start)
654 if i < start:
655 lines.append(text[start:])
656 break
657 j = text.find("]", i + 1)
658 if j <= i:
659 break
660 if "\n" in text[i:j]:
661 start = i
662 j += 1
663 if text[j] != "(":
664 raise __ve(f"Invalid image sequence in file {readme!r}", text, i)
665 k = text.find(")", j + 1)
666 if k <= j:
667 raise __ve("No closing gap for image sequence "
668 f"in file {readme!r}", text, i)
670 __check_url(text[j + 1:k], valid_urls)
671 total_links_checked += 1
673 lines.append(text[start:i])
674 start = k
676 text = "\n".join(lines)
677 lines.clear()
679 logger(f"Now checking '[...]()' style urls in file {readme!r}.")
681 # now gather the links and remove them
682 start = -1
683 lines.clear()
684 while True:
685 start += 1
686 i = text.find("[", start)
687 if i < start:
688 lines.append(text[start:])
689 break
690 j = text.find("]", i + 1)
691 if j <= i:
692 break
693 if "\n" in text[i:j]:
694 lines.append(text[start:i])
695 start = i
696 continue
697 j += 1
698 if text[j] != "(":
699 raise __ve(f"Invalid [...](...) link in file {readme!r}", text, i)
700 k = text.find(")", j + 1)
701 if k <= j:
702 raise __ve("No closing gap for [...](...)"
703 f" link in file {readme!r}", text, i)
705 __check_url(text[j + 1:k], valid_urls)
706 total_links_checked += 1
708 lines.append(text[start:i])
709 start = k
711 text = "\n".join(lines)
712 lines.clear()
714 logger(f"Now checking ' href=' style urls in file {readme!r}.")
716 # now gather the href links and remove them
717 for quot in "'\"":
718 start = -1
719 lines.clear()
720 while True:
721 start += 1
722 start_str = f" href={quot}"
723 i = text.find(start_str, start)
724 if i < start:
725 lines.append(text[start:])
726 break
727 j = text.find(quot, i + len(start_str))
728 if j <= i:
729 break
730 if "\n" in text[i:j]:
731 lines.append(text[start:i])
732 start = i
733 continue
734 __check_url(text[i + len(start_str):j], valid_urls)
735 total_links_checked += 1
737 lines.append(text[start:i])
738 start = j
740 text = "\n".join(lines)
741 lines.clear()
743 logger(f"Now checking ' src=' style urls in file {readme!r}.")
744 # now gather the image links and remove them
745 for quot in "'\"":
746 start = -1
747 lines.clear()
748 while True:
749 start += 1
750 start_str = f" src={quot}"
751 i = text.find(start_str, start)
752 if i < start:
753 lines.append(text[start:])
754 break
755 j = text.find(quot, i + len(start_str))
756 if j <= i:
757 break
758 if "\n" in text[i:j]:
759 lines.append(text[start:i])
760 start = i
761 continue
762 __check_url(text[i + len(start_str):j], valid_urls)
763 total_links_checked += 1
765 lines.append(text[start:i])
766 start = j
768 text = "\n".join(lines)
769 lines.clear()
771 logger(f"Now checking '<...>' style urls in file {readme!r}.")
772 start = -1
773 lines.clear()
774 while True:
775 start += 1
776 i = text.find("<http", start)
777 if i < start:
778 lines.append(text[start:])
779 break
780 j = text.find(">", i + 1)
781 if j <= i:
782 break
783 if "\n" in text[i:j]:
784 lines.append(text[start:i])
785 start = i
786 continue
787 __check_url(text[i + 1:j], valid_urls)
788 total_links_checked += 1
790 lines.append(text[start:i])
791 start = j
793 if total_links_checked <= 0:
794 raise ValueError(f"Found no links in file {readme!r}.")
795 logger(f"Finished testing all links {total_links_checked} in "
796 f"file {readme!r}.")