Coverage for pycommons / dev / tests / links_in_md.py: 92%
309 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
1"""Test all the links in."""
2from os import environ
3from random import randint
4from time import sleep
5from typing import Final, cast
7# noinspection PyPackageRequirements
8from certifi import where
10# noinspection PyPackageRequirements
11from urllib3 import PoolManager # type: ignore
13# noinspection PyPackageRequirements
14from urllib3.response import HTTPResponse # type: ignore
16from pycommons.io.console import logger
17from pycommons.io.path import UTF8, Path, file_path
18from pycommons.net.url import URL
19from pycommons.strings.string_tools import replace_str
20from pycommons.types import check_int_range, type_error
#: The hosts that we tolerate to be unreachable.
#: Inside a GitHub workflow (recognized via the `GITHUB_JOB` environment
#: variable), only `fsf.org` is tolerated. Anywhere else, e.g., on a local
#: machine, a few more hosts are sometimes unreachable and tolerated, too.
__SOMETIMES_UNREACHABLE_HOSTS: Final[set[str]] = {"fsf.org"}
if "GITHUB_JOB" not in environ:
    __SOMETIMES_UNREACHABLE_HOSTS.update(
        ("img.shields.io", "pypi.org", "docs.python.org"))

#: URLs that are known to be correct and therefore are never checked
__CORRECT_URLS: Final[set[str]] = {
    "http://example.com",
    "https://example.com",
    "http://github.com",
    "https://github.com",
    "https://www.acm.org/publications/policies/artifact-review"
    "-and-badging-current",
}
def __ve(msg: str, text: str, idx: int) -> ValueError:
    """
    Create a value error for the given text piece.

    The error is *returned*, not raised, so that the caller can write
    `raise __ve(...)`. Its message contains `msg` followed by an excerpt of
    `text` around the index `idx` (at most 32 characters before and 64
    characters from `idx` on). If the arguments themselves are unusable
    (empty message, empty text, index outside of the text), a
    :class:`ValueError` describing that problem is returned instead.

    :param msg: the message
    :param text: the string
    :param idx: the index into `text` around which the excerpt is taken,
        must be in `0..len(text)-1`
    :returns: a :class:`ValueError` ready to be raised
    :raises TypeError: if either argument is of the wrong type

    >>> try:
    ...     __ve(None, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve(1, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", None, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve("bla", 1, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", "txt", None)
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is None.

    >>> try:
    ...     __ve("bla", "txt", "x")
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is str, namely 'x'.

    >>> print(repr(__ve("", "txt", 1)))
    ValueError('Empty message!')

    >>> print(repr(__ve("msg", "", 1)))
    ValueError("Empty text '' for message 'msg'.")

    >>> print(repr(__ve("msg", "txt", 5)))
    ValueError("Index 5 is outside of text of length 3 for message 'msg'.")

    >>> print(repr(__ve("msg", "txt", -1)))
    ValueError("Index -1 is outside of text of length 3 for message 'msg'.")

    >>> print(repr(__ve("msg", "long text", 2)))
    ValueError("msg: '...long text...'")
    """
    # `str.__len__` doubles as type check: it raises a TypeError for non-str.
    if str.__len__(msg) == 0:
        return ValueError("Empty message!")
    len_text: Final[int] = str.__len__(text)
    if len_text <= 0:
        return ValueError(f"Empty text {text!r} for message {msg!r}.")
    if not isinstance(idx, int):
        raise type_error(idx, "idx", int)
    # Negative indices are rejected as well: they would otherwise silently
    # produce an excerpt from the start of the text.
    if not (0 <= idx < len_text):
        return ValueError(f"Index {idx} is outside of text of length"
                          f" {len_text} for message {msg!r}.")
    piece = text[max(0, idx - 32):min(len_text, idx + 64)].strip()
    return ValueError(f"{msg}: '...{piece}...'")
109def __make_headers() -> tuple[dict[str, str] | None, ...]:
110 """
111 Make the headers.
113 :returns: the headers
114 """
115 headers: list[dict[str, str] | None] = [None]
116 headers.extend(
117 {"User-Agent": ua} for ua in (
118 "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101"
119 " Firefox/138.0",
120 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
121 "KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",
122 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like"
123 " Gecko) Chrome/136.0.0.0 Safari/537.36",
124 "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101"
125 " Firefox/106.0",
126 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
127 "Gecko) Chrome/109.0.0.0 Safari/537.36",
128 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
129 "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0."
130 "1518.55",
131 "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 "
132 "Version/12.16.2",
133 "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) "
134 "like Gecko",
135 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/"
136 "537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
137 "Mozilla/5.0 (PLAYSTATION 3; 3.55)",
138 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
139 "KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/114.0.1823"
140 ".901",
141 "mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 ("
142 "khtml, like gecko) chrome/80.0.3987.87 safari/537.36 edg/80.0."
143 "361.502",
144 "Mozilla/5.0 (X11; Linux i686; rv:13.0) Gecko/13.0 Firefox/13.0",
145 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
146 "Gecko) Ubuntu Chromium/80.0.3987.149 HeadlessChrome/80.0.3987."
147 "149 Safari/537.36"))
148 return tuple(headers)
#: The headers to use for the HTTP requests.
#: It seems that some websites may throttle requests.
#: Maybe by using different headers, we can escape this.
#: The tuple is computed exactly once, when the module is loaded.
__HEADERS: Final[tuple[dict[str, str] | None, ...]] = __make_headers()
# The factory function has done its job: remove it from the module namespace.
del __make_headers
def __needs_body(url: URL) -> bool:
    """
    Check whether we need the body of the given url.

    If the complete body of the document needs to be downloaded, this function
    returns `True`. This is the case, for example, if we are talking about
    html documents. In this case, we need to (later) scan for internal
    references, i.e., for stuff like `id="..."` attributes. However, if the
    url does not point to an HTML document, maybe a PDF, then we do not need
    the whole body and return `False`. In the latter case, it is sufficient to
    do a `HEAD` HTTP request, in the former case we need a full `GET`.

    The decision is based on the path of the url only: The body is needed if
    there is no path, if the path ends in `.html`, `.htm`, or `/`, or if the
    last path segment has no file name extension, i.e., contains no `.`.
    Dots in directory names (such as version numbers) do not matter.

    :param url: the url
    :returns: `True` if the body is needed, `False` otherwise
    :raises TypeError: if `url` is not an instance of :class:`URL`

    >>> __needs_body(URL("http://www.github.com/"))
    True
    >>> __needs_body(URL("http://www.github.com"))
    True
    >>> __needs_body(URL("http://www.github.com/1.htm"))
    True
    >>> __needs_body(URL("http://www.github.com/1.html"))
    True
    >>> __needs_body(URL("http://www.github.com/1.jpg"))
    False
    >>> __needs_body(URL("http://www.github.com/1"))
    True
    >>> __needs_body(URL("http://www.github.com/v1.2/docs"))
    True
    >>> __needs_body(URL("http://www.github.com/v1.2/1.jpg"))
    False

    >>> try:
    ...     __needs_body(None)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is N

    >>> try:
    ...     __needs_body(1)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is i
    """
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    path: Final[str | None] = url.path
    if path is None:
        return True
    if str.endswith(path, (".html", ".htm", "/")):
        return True
    # Only the last path segment can carry a file name extension. A dot in a
    # directory name, e.g., "/3.12/library", must not turn an HTML page into
    # a "no body needed" resource, because then fragment links into that page
    # would be rejected.
    return "." not in path.rsplit("/", 1)[-1]
def __find_fragment_html(body: str, fragment: str, url: URL) -> None:
    r"""
    Check whether the fragment is contained in the body as ID.

    The body is searched for `id='fragment'`, `id="fragment"`, and the
    unquoted form `id=fragment`. The unquoted form only counts if the value
    ends right after the fragment, i.e., if it is followed by whitespace,
    `>`, `/`, or the end of the body. This way, `id=12` is not mistaken for
    the fragment `1`.

    :param body: the body that was loaded
    :param fragment: the fragment
    :param url: the url from which the body was loaded
    :raises TypeError: if `body` or `fragment` are not strings or if `url`
        is not an instance of :class:`URL`
    :raises ValueError: if `body` or `fragment` are empty, if `url` does not
        end in `fragment`, or if `body` does not contain `fragment` as an ID
        somewhere

    >>> __find_fragment_html("<p id='1'>bla</p>", "1",
    ...                      URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=\"1\">bla</p>", "1",
    ...                      URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=1>bla</p>", "1",
    ...                      URL("http://example.com#1"))

    >>> try:
    ...     __find_fragment_html(None, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html(1, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", None,
    ...                          URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", 1,
    ...                          URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", None)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is None.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", 1)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is int, namely 1.

    >>> try:
    ...     __find_fragment_html("", "1",
    ...                          URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty body: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "",
    ...                          URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty fragment: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1",
    ...                          URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Url 'http://example.com' does not end in fragment '1'.

    >>> try:
    ...     __find_fragment_html("<p id='x1'>bla</p>", "1",
    ...                          URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(str(ve)[:-4])
    Did not find id='1' of 'http://example.com#1' in body "<p id='x1'>bla</

    >>> try:
    ...     __find_fragment_html("<p id=12>bla</p>", "1",
    ...                          URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(str(ve).startswith("Did not find id='1'"))
    True
    """
    # `str.__len__` doubles as type check: it raises a TypeError for non-str.
    body_len: Final[int] = str.__len__(body)
    if body_len <= 0:
        raise ValueError(f"Empty body: {body!r}.")
    if str.__len__(fragment) <= 0:
        raise ValueError(f"Empty fragment: {fragment!r}.")
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    if not url.endswith(fragment):
        raise ValueError(
            f"Url {url!r} does not end in fragment {fragment!r}.")

    # quoted attribute values: the closing quote delimits the id exactly
    for qt in ("'", '"'):
        if f"id={qt}{fragment}{qt}" in body:
            return

    # Unquoted attribute values: a plain substring test would also accept
    # longer ids that merely start with the fragment, so we additionally
    # require that the value ends directly after the fragment.
    needle: Final[str] = f"id={fragment}"
    needle_len: Final[int] = str.__len__(needle)
    pos: int = body.find(needle)
    while pos >= 0:
        end: int = pos + needle_len
        if (end >= body_len) or (body[end] in " \t\r\n\f>/"):
            return
        pos = body.find(needle, pos + 1)

    raise ValueError(
        f"Did not find id={fragment!r} of {url!r} in body {body!r}.")
def __check_url(urlstr: str, valid_urls: dict[str, str | None],
                http: PoolManager = PoolManager(
                    cert_reqs="CERT_REQUIRED", ca_certs=where())) -> None:
    r"""
    Check whether a URL is valid and can be reached.

    URLs that are already contained in `valid_urls` or that are known to be
    correct are accepted right away, and so are `mailto` URLs. All other URLs
    must use an `http...` scheme and are requested via `http`: with `GET` if
    the body is needed (see `__needs_body`) and with `HEAD` otherwise. Up to
    three attempts with increasing timeouts are made and each attempt uses a
    randomly chosen header. A URL is OK if the final response has status 200
    and, if the URL has a fragment, the fragment is found as ID in the body.
    Successfully checked URLs are stored in `valid_urls` together with their
    body (or `None` if no body was loaded).

    If all attempts fail with an exception and the host is one of those
    that are sometimes unreachable, the URL is accepted without being stored.

    :param urlstr: the URL to be checked
    :param valid_urls: the URLs that have already been verified, mapped to
        their bodies (or `None`); this dictionary is updated
    :param http: the pool manager
    :raises TypeError: if any of the parameters is of the wrong type
    :raises ValueError: if the url `urlstr` cannot be loaded or if it has a
        fragment part that is not discovered in the body of the loaded
        document.

    >>> vu = dict()
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("tweise@hfuu.edu.cn", vu)

    >>> from contextlib import redirect_stdout

    >>> with redirect_stdout(None):
    ...     __check_url("https://thomasweise.github.io/pycommons", vu)
    ...     __check_url("http://example.com/", vu)
    ...     __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...                 ".io.html", vu)
    >>> __check_url("https://thomasweise.github.io/pycommons", vu)
    >>> __check_url(
    ...     "https://thomasweise.github.io/pycommons/pycommons.io.html", vu)

    >>> __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...             ".io.html#pycommons.io.path.Path", vu)
    >>> __check_url("http://example.com", vu)

    >>> try:
    ...     __check_url("bwri435//sdfsdf:-@@", vu)
    ... except ValueError as ve:
    ...     print(str(ve)[:50])
    Error in url 'bwri435//sdfsdf:-@@': URL part 'bwri

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruS.jpg#34", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s[:61])
    Url 'https://thomasweise.github.io/sifwrwruS.jpg#34' does not

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url("ssh://u@thomasweise.github.io/sifwrwruSSXFd", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s)
    Invalid scheme for url 'ssh://u@thomasweise.github.io/sifwrwruSSXFd'.

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruSSXFdfDX", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> s.endswith("returns code 404.") or s.startswith("Could not load url")
    True

    >>> try:
    ...     __check_url(None, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __check_url(1, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __check_url("http://example.com", None)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is None.

    >>> try:
    ...     __check_url("http://example.com", 1)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is int, namely 1.

    >>> try:
    ...     __check_url("http://example.com", dict(), None)
    ... except TypeError as te:
    ...     print(te)
    http should be an instance of urllib3.poolmanager.PoolManager but is None.

    >>> try:
    ...     __check_url("http://example.com", dict(), 1)
    ... except TypeError as te:
    ...     print(str(te)[:50])
    http should be an instance of urllib3.poolmanager.
    """
    if not isinstance(valid_urls, dict):
        raise type_error(valid_urls, "valid_urls", dict)
    if not isinstance(http, PoolManager):
        raise type_error(http, "http", PoolManager)

    if urlstr in valid_urls:
        return

    try:
        url: Final[URL] = URL(urlstr)
    except ValueError as ve:
        raise ValueError(f"Error in url {urlstr!r}: {ve}") from None

    if (url in __CORRECT_URLS) or (url in valid_urls):
        return
    if url.scheme == "mailto":
        return
    if not url.scheme.startswith("http"):
        raise ValueError(f"Invalid scheme for url {url!r}.")

    needs_body: Final[bool] = __needs_body(url)

    base_url: URL = url
    fragment: Final[str | None] = url.fragment
    if fragment is not None:
        base_url = URL(url[:url.index("#")])
        if not needs_body:
            raise ValueError(
                f"Url {url!r} does not need body but has "
                f"fragment {url.fragment!r}?")
        # If the document itself was loaded before, we only need to look
        # for the fragment in its cached body.
        if base_url in valid_urls:
            __find_fragment_html(valid_urls[base_url], fragment, url)
            return

    code: int
    body: str | None = None
    method = "GET" if needs_body else "HEAD"
    error: BaseException | None = None
    response: HTTPResponse | None = None
    headers: Final[list[dict[str, str] | None]] = list(__HEADERS)
    header_count: int = 0

# Sometimes, access to the URLs on GitHub fails.
# I think they probably throttle access from here.
# Therefore, we first do a request with 5s timeout and 0 retries.
# If that fails, we wait 2 seconds and try with timeout 8 and 3 retries.
# If that fails, we wait for 5s, then try with timeout 30 and 3 retries.
# If that fails too, we assume that the URL is really incorrect. This should
# rarely be a wrong conclusion, which justifies the many retries.
    for sleep_time, retries, timeout in (
            (0, 0, 5), (2, 3, 8), (5, 3, 30)):
        if sleep_time > 0:
            sleep(sleep_time)

# We try to get a random header to deal with the problem that some pages
# will not permit certain user agents. To handle this issue, we try to not
# use any user agent twice. We randomly pick a user agent and, if it fails,
# make sure to use all other user agents first before we use that one again.
        if header_count <= 0:
            header_count = len(headers)
        header_idx = randint(0, header_count - 1)  # noqa: S311
        header: dict[str, str] | None = headers[header_idx]
        header_count -= 1
        # move the chosen header behind the range of still unused headers
        headers[header_count], headers[header_idx] \
            = header, headers[header_count]
        try:
            response = cast("HTTPResponse", http.request(
                method, base_url, timeout=timeout, redirect=True,
                retries=retries, headers=header))
        except Exception as be:  # noqa
            # Only `Exception` is caught here: a KeyboardInterrupt or
            # SystemExit must stop the check instead of causing a retry.
            logger(f"Attempt sleep={sleep_time}, retries={retries}, "
                   f"timeout={timeout}, error={str(be)!r}, and "
                   f"header={header!r} for {base_url!r} gave {be}.")
            error = be
        else:
            # We got a response, so errors of earlier attempts are obsolete.
            # Without this reset, a failed first attempt would hide the
            # status code of a later response that is not 200.
            error = None
            if isinstance(response, HTTPResponse) and isinstance(
                    response.status, int) and (response.status == 200):
                break

    if error is not None:
        # sometimes, I cannot reach some hosts from here...
        if url.host in __SOMETIMES_UNREACHABLE_HOSTS:
            return  # we will accept this here
        raise ValueError(f"Could not load url {url!r}.") from error

    if not isinstance(response, HTTPResponse):  # should be impossible...
        raise ValueError(f"Response {response} from url={url!r}?")  # noqa

    code = check_int_range(response.status, "response.status", 0, 10000)
    if needs_body:
        try:
            body = str.strip(response.data.decode(UTF8))
        except Exception as be:  # noqa
            raise ValueError(f"Error in body of url {url!r}: {be}") from be

    body_len: Final[int] = 0 if body is None else str.__len__(body)
    logger(f"Checked url {url!r} got code {code} for method {method!r} and "
           f"{body_len} chars.")
    if code != 200:
        raise ValueError(f"Url {url!r} returns code {code}.")

    if needs_body and ((body is None) or (body_len <= 0)):
        raise ValueError(
            f"Stripped body for {url!r} / {base_url!r} is {body!r}?")

    # remember the successfully loaded document (and its body, if any)
    valid_urls[base_url] = body
    if url is not base_url:
        valid_urls[url] = body

    if fragment is not None:
        __find_fragment_html(body, fragment, url)
def __check_attr_urls(text: str, attr: str,
                      valid_urls: dict[str, str | None]) -> tuple[str, int]:
    """
    Check all urls given as quoted HTML attribute values and cut them out.

    Both the form ` attr='...'` and the form ` attr="..."` are processed.
    Attribute values that span multiple lines are skipped and not checked.

    :param text: the text to scan
    :param attr: the name of the attribute, e.g., `href` or `src`
    :param valid_urls: the urls verified so far, passed to `__check_url`
    :returns: a tuple of the text without the checked attribute values and
        the number of urls that were checked
    :raises ValueError: if `__check_url` rejects one of the urls
    """
    checked: int = 0
    kept: Final[list[str]] = []
    for quot in "'\"":
        opening: str = f" {attr}={quot}"
        opening_len: int = len(opening)
        kept.clear()
        start: int = -1
        while True:
            start += 1
            i: int = text.find(opening, start)
            if i < start:
                kept.append(text[start:])
                break
            j: int = text.find(quot, i + opening_len)
            if j <= i:
                # No closing quote: nothing more can be found, but the
                # remaining text must be kept for the following passes.
                kept.append(text[start:])
                break
            kept.append(text[start:i])
            if "\n" in text[i:j]:
                start = i  # skip this occurrence and search on behind it
                continue
            __check_url(text[i + opening_len:j], valid_urls)
            checked += 1
            start = j
        text = "\n".join(kept)
    return text, checked


def check_links_in_md(file: str) -> None:
    """
    Test all the links in the given file.

    The file is loaded as a single string and processed in several passes:

    1. multi-line code blocks delimited by three backticks are removed
    2. the ids of all headlines are collected as valid local link targets
    3. inline code in single backticks is removed
    4. the urls of images `![...](...)` are checked and removed
    5. the urls of links `[...](...)` are checked and removed
    6. the urls in ` href=` and ` src=` attributes are checked and removed
    7. the urls of the form `<http...>` are checked

    All urls are checked via `__check_url`.

    :param file: the file to check
    :raises ValueError: if the file is empty, if its structure cannot be
        processed, if any link is invalid, or if no link was found at all
    """
    # First, we load the file as a single string
    readme: Final[Path] = file_path(file)
    logger(f"Checking all links in the file {readme!r}.")

    text: str = readme.read_all_str()
    text_len: int = str.__len__(text)
    logger(f"Got {text_len} characters from file {readme!r}.")
    if text_len <= 0:
        raise ValueError(f"{readme!r} file is empty?")

    # remove all code blocks
    total_links_checked: int = 0
    start: int = -1
    lines: Final[list[str]] = []
    while True:
        start += 1
        i: int = text.find("\n```", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j: int = text.find("\n```", i + 1)
        if j < i:
            raise __ve("Multi-line code start without "
                       f"end in file {readme!r}", text, i)
        k: int = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Code end without newline in file {readme!r}",
                       text, i)
        lines.append(text[start:i].strip())
        start = k

    text = "\n".join(lines).strip()
    lines.clear()

    # these are all urls that have been verified
    valid_urls: Final[dict[str, str | None]] = {}

    # build the map of local reference marks
    start = -1
    while True:
        start += 1
        # a headline either starts the text or follows a newline
        i = 0 if ((start == 0) and text.startswith("#")) \
            else text.find("\n#", start)
        if i < start:
            break
        j = text.find(" ", i + 1)
        if (j < i) or (text[j - 1] != "#"):
            raise __ve("Headline without space after # "
                       f"in file {readme!r}", text, i)
        k = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Headline without end in file {readme!r}", text, i)
        # turn the headline text into the id used for local links
        rid: str = text[j:k].strip().replace(" ", "-")
        for ch in ".:,()`/":
            rid = rid.replace(ch, "")
        rid = replace_str("--", "-", rid).lower()
        # all headlines except one at the very start must begin with 1..9
        if (str.__len__(rid) <= 2) or ((rid[0] not in "123456789") and (
                start > 0)) or ("-" not in rid):
            raise __ve(f"Invalid id {rid!r} in file {readme!r}", text, i)
        valid_urls[f"#{rid}"] = None
        start = k

    # remove all inline code
    start = -1
    while True:
        start += 1
        i = text.find("`", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j = text.find("`", i + 1)
        if j < i:
            raise __ve("Multi-line code start "
                       f"without end in file {readme!r}", text, i)
        lines.append(text[start:i].strip())
        start = j
    text = "\n".join(lines).strip()
    lines.clear()

    logger(f"Now checking '![...]()' style urls in file {readme!r}.")

    # now gather the links to images and remove them
    start = -1
    while True:
        start += 1
        i = text.find("![", start)
        if i < start:
            lines.append(text[start:])
            break
        j = text.find("]", i + 1)
        if j <= i:
            # No closing bracket: there cannot be any further image, but
            # the remaining text must be kept for the following passes.
            lines.append(text[start:])
            break
        j += 1
        if (j >= len(text)) or (text[j] != "("):
            raise __ve(f"Invalid image sequence in file {readme!r}", text, i)
        k = text.find(")", j + 1)
        if k <= j:
            raise __ve("No closing gap for image sequence "
                       f"in file {readme!r}", text, i)

        __check_url(text[j + 1:k], valid_urls)
        total_links_checked += 1

        # The text before the image is always kept, also if the image
        # description spans several lines.
        lines.append(text[start:i])
        start = k

    text = "\n".join(lines)
    lines.clear()

    logger(f"Now checking '[...]()' style urls in file {readme!r}.")

    # now gather the links and remove them
    start = -1
    while True:
        start += 1
        i = text.find("[", start)
        if i < start:
            lines.append(text[start:])
            break
        j = text.find("]", i + 1)
        if j <= i:
            # no closing bracket: keep the rest for the following passes
            lines.append(text[start:])
            break
        if "\n" in text[i:j]:
            # brackets spanning several lines are not treated as links
            lines.append(text[start:i])
            start = i
            continue
        j += 1
        if (j >= len(text)) or (text[j] != "("):
            raise __ve(f"Invalid [...](...) link in file {readme!r}", text, i)
        k = text.find(")", j + 1)
        if k <= j:
            raise __ve("No closing gap for [...](...)"
                       f" link in file {readme!r}", text, i)

        __check_url(text[j + 1:k], valid_urls)
        total_links_checked += 1

        lines.append(text[start:i])
        start = k

    text = "\n".join(lines)
    lines.clear()

    logger(f"Now checking ' href=' style urls in file {readme!r}.")

    # now gather the href links and remove them
    checked: int
    text, checked = __check_attr_urls(text, "href", valid_urls)
    total_links_checked += checked

    logger(f"Now checking ' src=' style urls in file {readme!r}.")
    # now gather the image links and remove them
    text, checked = __check_attr_urls(text, "src", valid_urls)
    total_links_checked += checked

    logger(f"Now checking '<...>' style urls in file {readme!r}.")
    # This is the last pass, so the remaining text need not be collected.
    start = -1
    while True:
        start += 1
        i = text.find("<http", start)
        if i < start:
            break
        j = text.find(">", i + 1)
        if j <= i:
            break
        if "\n" in text[i:j]:
            start = i
            continue
        __check_url(text[i + 1:j], valid_urls)
        total_links_checked += 1
        start = j

    if total_links_checked <= 0:
        raise ValueError(f"Found no links in file {readme!r}.")
    logger(f"Finished testing all links {total_links_checked} in "
           f"file {readme!r}.")