Coverage for pycommons/dev/tests/links_in_md.py: 94% (309 statements)
coverage.py v7.13.2, created at 2026-02-02 06:36 +0000
1"""Test all the links in."""
2from os import environ
3from random import randint
4from time import sleep
5from typing import Final, cast
7# noinspection PyPackageRequirements
8from certifi import where
10# noinspection PyPackageRequirements
11from urllib3 import PoolManager # type: ignore
13# noinspection PyPackageRequirements
14from urllib3.response import HTTPResponse # type: ignore
16from pycommons.io.console import logger
17from pycommons.io.path import UTF8, Path, file_path
18from pycommons.net.url import URL
19from pycommons.strings.string_tools import replace_str
20from pycommons.types import check_int_range, type_error
#: The hosts that are sometimes unreachable from my local machine.
#: When the test is executed in a GitHub workflow, all hosts should be
#: reachable, except sometimes our institute's website and fsf.org.
__SOMETIMES_UNREACHABLE_HOSTS: Final[set[str]] = \
    {"fsf.org"} if "GITHUB_JOB" in environ else \
    {"fsf.org", "img.shields.io", "pypi.org", "docs.python.org"}

#: URLs that we never need to check because they are known to be OK.
__CORRECT_URLS: Final[set[str]] = {
    "https://example.com", "http://example.com",
    "https://github.com", "http://github.com",
    ("https://www.acm.org/publications/policies/artifact-review"
     "-and-badging-current")}


def __ve(msg: str, text: str, idx: int) -> ValueError:
    """
    Create a value error for the given text piece.

    :param msg: the message
    :param text: the string
    :param idx: the index into `text`
    :returns: a :class:`ValueError` ready to be raised
    :raises TypeError: if any argument is of the wrong type

    >>> try:
    ...     __ve(None, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve(1, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", None, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve("bla", 1, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", "txt", None)
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is None.

    >>> try:
    ...     __ve("bla", "txt", "x")
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is str, namely 'x'.

    >>> print(repr(__ve("", "txt", 1)))
    ValueError('Empty message!')

    >>> print(repr(__ve("msg", "", 1)))
    ValueError("Empty text '' for message 'msg'.")

    >>> print(repr(__ve("msg", "txt", 5)))
    ValueError("Index 5 is outside of text of length 3 for message 'msg'.")

    >>> print(repr(__ve("msg", "long text", 2)))
    ValueError("msg: '...long text...'")
    """
    if str.__len__(msg) == 0:
        return ValueError("Empty message!")
    len_text: Final[int] = str.__len__(text)
    if len_text <= 0:
        return ValueError(f"Empty text {text!r} for message {msg!r}.")
    if not isinstance(idx, int):
        raise type_error(idx, "idx", int)
    if len_text <= idx:
        return ValueError(f"Index {idx} is outside of text of length"
                          f" {len_text} for message {msg!r}.")
    # Show a window of the text around the offending index for context.
    piece = text[max(0, idx - 32):min(len_text, idx + 64)].strip()
    return ValueError(f"{msg}: '...{piece}...'")


def __make_headers() -> tuple[dict[str, str] | None, ...]:
    """
    Make the headers.

    :returns: the headers
    """
    headers: list[dict[str, str] | None] = [None]
    headers.extend(
        {"User-Agent": ua} for ua in (
            ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) "
             "Gecko/20100101 Firefox/138.0"),
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
             "KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
             " Edg/136.0.0.0"),
            ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like"
             " Gecko) Chrome/136.0.0.0 Safari/537.36"),
            ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) "
             "Gecko/20100101 Firefox/106.0"),
            ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like"
             " Gecko) Chrome/109.0.0.0 Safari/537.36"),
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
             "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0."
             "1518.55"),
            ("Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 "
             "Version/12.16.2"),
            ("Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) "
             "like Gecko"),
            ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/"
             "537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A"),
            "Mozilla/5.0 (PLAYSTATION 3; 3.55)",
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
             "KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/114.0."
             "1823.901"),
            ("mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 ("
             "khtml, like gecko) chrome/80.0.3987.87 safari/537.36 edg/80.0."
             "361.502"),
            "Mozilla/5.0 (X11; Linux i686; rv:13.0) Gecko/13.0 Firefox/13.0",
            ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML"
             ", like Gecko) Ubuntu Chromium/80.0.3987.149 HeadlessChrome/"
             "80.0.3987.149 Safari/537.36")))
    return tuple(headers)


#: The headers to use for the HTTP requests.
#: It seems that some websites may throttle requests.
#: Maybe by using different headers, we can escape this.
__HEADERS: Final[tuple[dict[str, str] | None, ...]] = __make_headers()
del __make_headers


def __needs_body(url: URL) -> bool:
    """
    Check whether we need the body of the given URL.

    If the complete body of the document needs to be downloaded, this
    function returns `True`. This is the case, for example, for HTML
    documents: there, we need to (later) scan for internal references,
    i.e., for attributes like `id="..."`. However, if the URL does not
    point to an HTML document, maybe to a PDF, then we do not need the
    whole body and return `False`. In the latter case, it is sufficient to
    do a `HEAD` HTTP request; in the former case, we need a full `GET`.

    :param url: the URL
    :returns: `True` if the body is needed, `False` otherwise
    :raises TypeError: if `url` is not an instance of :class:`URL`

    >>> __needs_body(URL("http://www.github.com/"))
    True
    >>> __needs_body(URL("http://www.github.com"))
    True
    >>> __needs_body(URL("http://www.github.com/1.htm"))
    True
    >>> __needs_body(URL("http://www.github.com/1.html"))
    True
    >>> __needs_body(URL("http://www.github.com/1.jpg"))
    False
    >>> __needs_body(URL("http://www.github.com/1"))
    True
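
    A PDF link, for instance, needs no body either (an added example,
    consistent with the suffix rule implemented below):

    >>> __needs_body(URL("http://www.github.com/doc.pdf"))
    False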

    >>> try:
    ...     __needs_body(None)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is N

    >>> try:
    ...     __needs_body(1)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is i
    """
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    return (url.path is None) or str.endswith(
        url.path, (".html", ".htm", "/")) or ("." not in url.path)


def __find_fragment_html(body: str, fragment: str, url: URL) -> None:
    r"""
    Check whether the fragment is contained in the body as an ID.

    :param body: the body that was loaded
    :param fragment: the fragment
    :param url: the URL from which the body was loaded
    :raises TypeError: if `body` or `fragment` is not a string, or if
        `url` is not an instance of :class:`URL`
    :raises ValueError: if `body` does not contain `fragment` as an ID
        somewhere

    >>> __find_fragment_html("<p id='1'>bla</p>", "1",
    ...                      URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=\"1\">bla</p>", "1",
    ...                      URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=1>bla</p>", "1",
    ...                      URL("http://example.com#1"))
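
    An id generated from a heading works the same way (an added
    illustrative example):

    >>> __find_fragment_html('<h2 id="usage">Usage</h2>', "usage",
    ...                      URL("http://example.com#usage"))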

    >>> try:
    ...     __find_fragment_html(None, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html(1, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", None,
    ...                          URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", 1,
    ...                          URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", None)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is None.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", 1)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is int, namely 1.

    >>> try:
    ...     __find_fragment_html("", "1",
    ...                          URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty body: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "",
    ...                          URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty fragment: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1",
    ...                          URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Url 'http://example.com' does not end in fragment '1'.

    >>> try:
    ...     __find_fragment_html("<p id='x1'>bla</p>", "1",
    ...                          URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(str(ve)[:-4])
    Did not find id='1' of 'http://example.com#1' in body "<p id='x1'>bla</
    """
    if str.__len__(body) <= 0:
        raise ValueError(f"Empty body: {body!r}.")
    if str.__len__(fragment) <= 0:
        raise ValueError(f"Empty fragment: {fragment!r}.")
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    if not url.endswith(fragment):
        raise ValueError(
            f"Url {url!r} does not end in fragment {fragment!r}.")

    # Accept the unquoted, single-quoted, and double-quoted id notations.
    for qt in ("", "'", '"'):
        if f"id={qt}{fragment}{qt}" in body:
            return

    raise ValueError(
        f"Did not find id={fragment!r} of {url!r} in body {body!r}.")


def __check_url(urlstr: str, valid_urls: dict[str, str | None],
                http: PoolManager = PoolManager(
                    cert_reqs="CERT_REQUIRED", ca_certs=where())) -> None:
    r"""
    Check whether a URL is valid and can be reached.

    :param urlstr: the URL to be checked
    :param valid_urls: the dictionary of known-valid URLs, mapping each
        URL to its body (or `None`)
    :param http: the pool manager
    :raises TypeError: if any of the parameters is of the wrong type
    :raises ValueError: if the URL `urlstr` cannot be loaded or if it has
        a fragment part that is not discovered in the body of the loaded
        document

    >>> vu = dict()
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("tweise@hfuu.edu.cn", vu)

    >>> __check_url("https://thomasweise.github.io/pycommons/#introduction",
    ...             {})

    >>> from contextlib import redirect_stdout
    >>> with redirect_stdout(None):  # check __SOMETIMES_UNREACHABLE_HOSTS
    ...     __check_url("https://fsf.org/111111111111111", vu)

    >>> try:
    ...     with redirect_stdout(None):
    ...         __check_url("https://github.io.github.io/111111111", vu)
    ... except ValueError as ve:
    ...     print(str(ve))
    Could not load url 'https://github.io.github.io/111111111'.

    >>> with redirect_stdout(None):
    ...     __check_url("https://thomasweise.github.io/pycommons", vu)
    ...     __check_url("http://example.com/", vu)
    ...     __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...                 ".io.html", vu)
    >>> __check_url("https://thomasweise.github.io/pycommons", vu)
    >>> __check_url(
    ...     "https://thomasweise.github.io/pycommons/pycommons.io.html", vu)

    >>> __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...             ".io.html#pycommons.io.path.Path", vu)
    >>> __check_url("http://example.com", vu)

    >>> try:
    ...     __check_url("bwri435//sdfsdf:-@@", vu)
    ... except ValueError as ve:
    ...     print(str(ve)[:50])
    Error in url 'bwri435//sdfsdf:-@@': URL part 'bwri

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruS.jpg#34", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s[:61])
    Url 'https://thomasweise.github.io/sifwrwruS.jpg#34' does not

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url("ssh://u@thomasweise.github.io/sifwrwruSSXFd", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s)
    Invalid scheme for url 'ssh://u@thomasweise.github.io/sifwrwruSSXFd'.

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruSSXFdfDX", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> s.endswith("returns code 404.") or s.startswith("Could not load url")
    True

    >>> try:
    ...     __check_url(None, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __check_url(1, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __check_url("http://example.com", None)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is None.

    >>> try:
    ...     __check_url("http://example.com", 1)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is int, namely 1.

    >>> try:
    ...     __check_url("http://example.com", dict(), None)
    ... except TypeError as te:
    ...     print(te)
    http should be an instance of urllib3.poolmanager.PoolManager but is None.

    >>> try:
    ...     __check_url("http://example.com", dict(), 1)
    ... except TypeError as te:
    ...     print(str(te)[:50])
    http should be an instance of urllib3.poolmanager.
    """
    if not isinstance(valid_urls, dict):
        raise type_error(valid_urls, "valid_urls", dict)
    if not isinstance(http, PoolManager):
        raise type_error(http, "http", PoolManager)

    if urlstr in valid_urls:
        return

    try:
        url: Final[URL] = URL(urlstr)
    except ValueError as ve:
        raise ValueError(f"Error in url {urlstr!r}: {ve}") from None

    if (url in __CORRECT_URLS) or (url in valid_urls):
        return
    if url.scheme == "mailto":
        return
    if not url.scheme.startswith("http"):
        raise ValueError(f"Invalid scheme for url {url!r}.")

    needs_body: Final[bool] = __needs_body(url)

    base_url: URL = url
    fragment: Final[str | None] = url.fragment
    if fragment is not None:
        base_url = URL(url[:url.index("#")])
        if not needs_body:
            raise ValueError(
                f"Url {url!r} does not need body but has "
                f"fragment {url.fragment!r}?")
        if base_url in valid_urls:
            __find_fragment_html(valid_urls[base_url], fragment, url)
            return

    code: int
    body: str | None = None
    method = "GET" if needs_body else "HEAD"
    error: BaseException | None = None
    response: HTTPResponse | None = None
    headers: Final[list[dict[str, str] | None]] = list(__HEADERS)
    header_count: int = 0

    # Sometimes, access to the URLs on GitHub fails.
    # I think they probably throttle access from here.
    # Therefore, we first do a request with a 5s timeout and 0 retries.
    # If that fails, we wait 2 seconds and try with timeout 8 and 3
    # retries. If that fails, we wait 5s, then try with timeout 30 and 3
    # retries. Only if that fails too do we assume that the URL is really
    # incorrect, which should only rarely be a wrong conclusion
    # (justifying the many retries).
    for sleep_time, retries, timeout in (
            (0, 0, 5), (2, 3, 8), (5, 3, 30)):
        if sleep_time > 0:
            sleep(sleep_time)

        # We pick a random header to deal with the problem that some
        # pages will not permit certain user agents. To handle this
        # issue, we try not to use any user agent twice: we randomly pick
        # a user agent and, if it fails, make sure to use all other user
        # agents first before we use that one again.
        if header_count <= 0:
            header_count = len(headers)
        header_idx = randint(0, header_count - 1)  # noqa: S311
        header: dict[str, str] | None = headers[header_idx]
        header_count -= 1
        headers[header_count], headers[header_idx] \
            = header, headers[header_count]
        try:
            response = cast("HTTPResponse", http.request(
                method, base_url, timeout=timeout, redirect=True,
                retries=retries, headers=header))
            if isinstance(response, HTTPResponse) and isinstance(
                    response.status, int) and (response.status == 200):
                error = None
                break
        except BaseException as be:  # noqa
            logger(f"Attempt sleep={sleep_time}, retries={retries}, "
                   f"timeout={timeout}, error={str(be)!r}, and "
                   f"header={header!r} for {base_url!r} gave {be}.")
            error = be

    if error is not None:
        # sometimes, I cannot reach some hosts from here...
        if url.host in __SOMETIMES_UNREACHABLE_HOSTS:
            return  # we will accept this here
        raise ValueError(f"Could not load url {url!r}.") from error

    if not isinstance(response, HTTPResponse):  # should be impossible...
        raise ValueError(f"Response {response} from url={url!r}?")  # noqa

    code = check_int_range(response.status, "response.status", 0, 10000)
    if needs_body:
        try:
            body = str.strip(response.data.decode(UTF8))
        except BaseException as be:  # noqa
            raise ValueError(f"Error in body of url {url!r}: {be}") from be

    body_len: Final[int] = 0 if body is None else str.__len__(body)
    logger(f"Checked url {url!r} got code {code} for method {method!r} and "
           f"{body_len} chars.")
    if code != 200:
        raise ValueError(f"Url {url!r} returns code {code}.")

    if needs_body and ((body is None) or (body_len <= 0)):
        raise ValueError(
            f"Stripped body for {url!r} / {base_url!r} is {body!r}?")

    valid_urls[base_url] = body
    if url is not base_url:
        valid_urls[url] = body

    if fragment is not None:
        __find_fragment_html(body, fragment, url)


def check_links_in_md(file: str) -> None:
    """
    Test all the links in the given file.

    :param file: the file to check

    >>> from pycommons.io.temp import temp_file
    >>> with temp_file() as tf:
    ...     tf.write_all_str("[test](https://example.com)")
    ...     check_links_in_md(tf)
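
    A local `#fragment` link is resolved against the ids generated from
    the headings of the file itself (an added example; `example.com` is a
    known-correct URL, so no network access is needed):

    >>> with temp_file() as tf:
    ...     tf.write_all_str("# 1. Intro" + chr(10) + "[x](#1-intro)"
    ...                      + chr(10) + "[test](https://example.com)")
    ...     check_links_in_md(tf)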

    >>> with temp_file() as tf:
    ...     try:
    ...         check_links_in_md(tf)
    ...     except ValueError as ve:
    ...         print(str(ve)[-19:])
    ' contains no text.
    """
    # First, we load the file as a single string.
    readme: Final[Path] = file_path(file)
    logger(f"Checking all links in the file {readme!r}.")

    text: str = readme.read_all_str()
    text_len: int = str.__len__(text)
    logger(f"Got {text_len} characters from file {readme!r}.")
    if text_len <= 0:
        raise ValueError(f"{readme!r} file is empty?")

    # Remove all fenced code blocks: links inside code need no checking.
    total_links_checked: int = 0
    start: int = -1
    lines: Final[list[str]] = []
    while True:
        start += 1
        i: int = text.find("\n```", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j: int = text.find("\n```", i + 1)
        if j < i:
            raise __ve("Multi-line code start without "
                       f"end in file {readme!r}", text, i)
        k: int = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Code end without newline in file {readme!r}",
                       text, i)
        lines.append(text[start:i].strip())
        start = k

    text = "\n".join(lines).strip()
    lines.clear()

    # These are all the URLs that have been verified.
    valid_urls: Final[dict[str, str | None]] = {}

    # Build the map of local reference marks: every headline yields a
    # GitHub-style anchor id, e.g., "# 1. Intro" yields "#1-intro".
    start = -1
    while True:
        start += 1
        i = 0 if ((start == 0) and text.startswith("#")) \
            else text.find("\n#", start)
        if i < start:
            break
        j = text.find(" ", i + 1)
        if (j < i) or (text[j - 1] != "#"):
            raise __ve("Headline without space after # "
                       f"in file {readme!r}", text, i)
        k = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Headline without end in file {readme!r}", text, i)
        rid: str = text[j:k].strip().replace(" ", "-")
        for ch in ".:,()`/":
            rid = rid.replace(ch, "")
        rid = replace_str("--", "-", rid).lower()
        # Only the very first headline may lack a leading section number.
        if (str.__len__(rid) <= 2) or ((rid[0] not in "123456789") and (
                start > 0)) or ("-" not in rid):
            raise __ve(f"Invalid id {rid!r} in file {readme!r}", text, i)
        valid_urls[f"#{rid}"] = None
        start = k

    # Remove all inline code.
    start = -1
    while True:
        start += 1
        i = text.find("`", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j = text.find("`", i + 1)
        if j < i:
            raise __ve("Inline code start "
                       f"without end in file {readme!r}", text, i)
        lines.append(text[start:i].strip())
        start = j
    text = "\n".join(lines).strip()
    lines.clear()
646 logger(f"Now checking '![...]()' style urls in file {readme!r}.")
648 # now gather the links to images and remove them
649 start = -1
650 lines.clear()
651 while True:
652 start += 1
653 i = text.find("![", start)
654 if i < start:
655 lines.append(text[start:])
656 break
657 j = text.find("]", i + 1)
658 if j <= i:
659 break
660 if "\n" in text[i:j]:
661 start = i
662 j += 1
663 if text[j] != "(":
664 raise __ve(f"Invalid image sequence in file {readme!r}", text, i)
665 k = text.find(")", j + 1)
666 if k <= j:
667 raise __ve("No closing gap for image sequence "
668 f"in file {readme!r}", text, i)
670 __check_url(text[j + 1:k], valid_urls)
671 total_links_checked += 1
673 lines.append(text[start:i])
674 start = k
676 text = "\n".join(lines)
677 lines.clear()
679 logger(f"Now checking '[...]()' style urls in file {readme!r}.")
681 # now gather the links and remove them
682 start = -1
683 lines.clear()
684 while True:
685 start += 1
686 i = text.find("[", start)
687 if i < start:
688 lines.append(text[start:])
689 break
690 j = text.find("]", i + 1)
691 if j <= i:
692 break
693 if "\n" in text[i:j]:
694 lines.append(text[start:i])
695 start = i
696 continue
697 j += 1
698 if text[j] != "(":
699 raise __ve(f"Invalid [...](...) link in file {readme!r}", text, i)
700 k = text.find(")", j + 1)
701 if k <= j:
702 raise __ve("No closing gap for [...](...)"
703 f" link in file {readme!r}", text, i)
705 __check_url(text[j + 1:k], valid_urls)
706 total_links_checked += 1
708 lines.append(text[start:i])
709 start = k
711 text = "\n".join(lines)
712 lines.clear()
714 logger(f"Now checking ' href=' style urls in file {readme!r}.")
716 # now gather the href links and remove them
717 for quot in "'\"":
718 start = -1
719 lines.clear()
720 while True:
721 start += 1
722 start_str = f" href={quot}"
723 i = text.find(start_str, start)
724 if i < start:
725 lines.append(text[start:])
726 break
727 j = text.find(quot, i + len(start_str))
728 if j <= i:
729 break
730 if "\n" in text[i:j]:
731 lines.append(text[start:i])
732 start = i
733 continue
734 __check_url(text[i + len(start_str):j], valid_urls)
735 total_links_checked += 1
737 lines.append(text[start:i])
738 start = j
740 text = "\n".join(lines)
741 lines.clear()
743 logger(f"Now checking ' src=' style urls in file {readme!r}.")
744 # now gather the image links and remove them
745 for quot in "'\"":
746 start = -1
747 lines.clear()
748 while True:
749 start += 1
750 start_str = f" src={quot}"
751 i = text.find(start_str, start)
752 if i < start:
753 lines.append(text[start:])
754 break
755 j = text.find(quot, i + len(start_str))
756 if j <= i:
757 break
758 if "\n" in text[i:j]:
759 lines.append(text[start:i])
760 start = i
761 continue
762 __check_url(text[i + len(start_str):j], valid_urls)
763 total_links_checked += 1
765 lines.append(text[start:i])
766 start = j
768 text = "\n".join(lines)
769 lines.clear()
771 logger(f"Now checking '<...>' style urls in file {readme!r}.")
772 start = -1
773 lines.clear()
774 while True:
775 start += 1
776 i = text.find("<http", start)
777 if i < start:
778 lines.append(text[start:])
779 break
780 j = text.find(">", i + 1)
781 if j <= i:
782 break
783 if "\n" in text[i:j]:
784 lines.append(text[start:i])
785 start = i
786 continue
787 __check_url(text[i + 1:j], valid_urls)
788 total_links_checked += 1
790 lines.append(text[start:i])
791 start = j
793 if total_links_checked <= 0:
794 raise ValueError(f"Found no links in file {readme!r}.")
795 logger(f"Finished testing all links {total_links_checked} in "
796 f"file {readme!r}.")