Coverage for pycommons/dev/tests/links_in_md.py: 94% (309 statements)

1"""Test all the links in.""" 

2from os import environ 

3from random import randint 

4from time import sleep 

5from typing import Final, cast 

6 

7# noinspection PyPackageRequirements 

8from certifi import where 

9 

10# noinspection PyPackageRequirements 

11from urllib3 import PoolManager # type: ignore 

12 

13# noinspection PyPackageRequirements 

14from urllib3.response import HTTPResponse # type: ignore 

15 

16from pycommons.io.console import logger 

17from pycommons.io.path import UTF8, Path, file_path 

18from pycommons.net.url import URL 

19from pycommons.strings.string_tools import replace_str 

20from pycommons.types import check_int_range, type_error 

21 

22#: The hosts that somtimes are unreachable from my local machine. 

23#: When the test is executed in a GitHub workflow, all hosts should be 

24#: reachable, except sometimes our institute's website and fsf.org. 

25__SOMETIMES_UNREACHABLE_HOSTS: Final[set[str]] = \ 

26 {"fsf.org"} if "GITHUB_JOB" in environ else \ 

27 {"fsf.org", "img.shields.io", "pypi.org", "docs.python.org"} 

28 

29#: URLs that we never need to check because they are OK 

30__CORRECT_URLS: Final[set[str]] = { 

31 "https://example.com", "http://example.com", 

32 "https://github.com", "http://github.com", 

33 ("https://www.acm.org/publications/policies/artifact-review" 

34 "-and-badging-current")} 

35 

36 

def __ve(msg: str, text: str, idx: int) -> ValueError:
    """
    Create a value error for the given text piece.

    :param msg: the message
    :param text: the string
    :param idx: the index
    :returns: a :class:`ValueError` ready to be raised
    :raises TypeError: if either argument is of the wrong type

    >>> try:
    ...     __ve(None, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve(1, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", None, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve("bla", 1, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", "txt", None)
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is None.

    >>> try:
    ...     __ve("bla", "txt", "x")
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is str, namely 'x'.

    >>> print(repr(__ve("", "txt", 1)))
    ValueError('Empty message!')

    >>> print(repr(__ve("msg", "", 1)))
    ValueError("Empty text '' for message 'msg'.")

    >>> print(repr(__ve("msg", "txt", 5)))
    ValueError("Index 5 is outside of text of length 3 for message 'msg'.")

    >>> print(repr(__ve("msg", "long text", 2)))
    ValueError("msg: '...long text...'")
    """
    if str.__len__(msg) == 0:
        return ValueError("Empty message!")
    len_text: Final[int] = str.__len__(text)
    if len_text <= 0:
        return ValueError(f"Empty text {text!r} for message {msg!r}.")
    if not isinstance(idx, int):
        raise type_error(idx, "idx", int)
    if len_text <= idx:
        return ValueError(f"Index {idx} is outside of text of length"
                          f" {len_text} for message {msg!r}.")
    piece = text[max(0, idx - 32):min(len_text, idx + 64)].strip()
    return ValueError(f"{msg}: '...{piece}...'")


def __make_headers() -> tuple[dict[str, str] | None, ...]:
    """
    Make the headers.

    :returns: the headers
    """
    headers: list[dict[str, str] | None] = [None]
    headers.extend(
        {"User-Agent": ua} for ua in (
            ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) "
             "Gecko/20100101 Firefox/138.0"),
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
             "KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
             " Edg/136.0.0.0"),
            ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like"
             " Gecko) Chrome/136.0.0.0 Safari/537.36"),
            ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) "
             "Gecko/20100101 Firefox/106.0"),
            ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like"
             " Gecko) Chrome/109.0.0.0 Safari/537.36"),
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
             "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0."
             "1518.55"),
            ("Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 "
             "Version/12.16.2"),
            ("Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) "
             "like Gecko"),
            ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/"
             "537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A"),
            "Mozilla/5.0 (PLAYSTATION 3; 3.55)",
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
             "KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/114.0."
             "1823.901"),
            ("mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 ("
             "khtml, like gecko) chrome/80.0.3987.87 safari/537.36 edg/80.0."
             "361.502"),
            "Mozilla/5.0 (X11; Linux i686; rv:13.0) Gecko/13.0 Firefox/13.0",
            ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML"
             ", like Gecko) Ubuntu Chromium/80.0.3987.149 HeadlessChrome/"
             "80.0.3987.149 Safari/537.36")))
    return tuple(headers)


#: The headers to use for the HTTP requests.
#: It seems that some websites may throttle requests.
#: Maybe by using different headers, we can escape this.
__HEADERS: Final[tuple[dict[str, str] | None, ...]] = __make_headers()
del __make_headers
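# After this, __HEADERS holds `None` (meaning: use the default headers of
# urllib3) followed by one {"User-Agent": ...} dict per browser signature
# listed in __make_headers above.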

def __needs_body(url: URL) -> bool:
    """
    Check whether we need the body of the given url.

    If the complete body of the document needs to be downloaded, this
    function returns `True`. This is the case, for example, if we are
    talking about HTML documents. In this case, we need to (later) scan for
    internal references, i.e., for stuff like `id="..."` attributes.
    However, if the url does not point to an HTML document, but maybe to a
    PDF, then we do not need the whole body and return `False`. In the
    latter case, it is sufficient to do a `HEAD` HTTP request; in the former
    case, we need a full `GET`.

    :param url: the url string
    :returns: `True` if the body is needed, `False` otherwise
    :raises TypeError: if `url` is not an instance of :class:`URL`

    >>> __needs_body(URL("http://www.github.com/"))
    True
    >>> __needs_body(URL("http://www.github.com"))
    True
    >>> __needs_body(URL("http://www.github.com/1.htm"))
    True
    >>> __needs_body(URL("http://www.github.com/1.html"))
    True
    >>> __needs_body(URL("http://www.github.com/1.jpg"))
    False
    >>> __needs_body(URL("http://www.github.com/1"))
    True

    >>> try:
    ...     __needs_body(None)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is N

    >>> try:
    ...     __needs_body(1)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is i
    """
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    return (url.path is None) or str.endswith(
        url.path, (".html", ".htm", "/")) or ("." not in url.path)


def __find_fragment_html(body: str, fragment: str, url: URL) -> None:
    r"""
    Check whether the fragment is contained in the body as an ID.

    :param body: the body that was loaded
    :param fragment: the fragment
    :param url: the url from which the body was loaded
    :raises TypeError: if `body` or `fragment` is not a string or if `url`
        is not an instance of :class:`URL`
    :raises ValueError: if `body` does not contain `fragment` as an ID
        somewhere

    >>> __find_fragment_html("<p id='1'>bla</p>", "1",
    ...     URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=\"1\">bla</p>", "1",
    ...     URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=1>bla</p>", "1",
    ...     URL("http://example.com#1"))

    >>> try:
    ...     __find_fragment_html(None, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html(1, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", None,
    ...         URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", 1,
    ...         URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'


    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", None)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is None.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", 1)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is int, namely 1.

    >>> try:
    ...     __find_fragment_html("", "1",
    ...         URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty body: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "",
    ...         URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty fragment: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1",
    ...         URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Url 'http://example.com' does not end in fragment '1'.

    >>> try:
    ...     __find_fragment_html("<p id='x1'>bla</p>", "1",
    ...         URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(str(ve)[:-4])
    Did not find id='1' of 'http://example.com#1' in body "<p id='x1'>bla</
    """
    if str.__len__(body) <= 0:
        raise ValueError(f"Empty body: {body!r}.")
    if str.__len__(fragment) <= 0:
        raise ValueError(f"Empty fragment: {fragment!r}.")
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    if not url.endswith(fragment):
        raise ValueError(
            f"Url {url!r} does not end in fragment {fragment!r}.")

    # Accept the fragment as an ID with no quotes, single quotes, or
    # double quotes.
    for qt in ("", "'", '"'):
        if f"id={qt}{fragment}{qt}" in body:
            return

    raise ValueError(
        f"Did not find id={fragment!r} of {url!r} in body {body!r}.")


def __check_url(urlstr: str, valid_urls: dict[str, str | None],
                http: PoolManager = PoolManager(
                    cert_reqs="CERT_REQUIRED", ca_certs=where())) -> None:
    r"""
    Check whether a URL is valid and can be reached.

    :param urlstr: the URL to be checked
    :param valid_urls: the dictionary of known-valid urls, mapping each url
        to its body (or to `None` if no body was needed)
    :param http: the pool manager
    :raises TypeError: if any of the parameters is of the wrong type
    :raises ValueError: if the url `urlstr` cannot be loaded or if it has a
        fragment part that is not discovered in the body of the loaded
        document.

    >>> vu = dict()
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("tweise@hfuu.edu.cn", vu)

    >>> __check_url("https://thomasweise.github.io/pycommons/#introduction",
    ...     {})

    >>> from contextlib import redirect_stdout
    >>> with redirect_stdout(None):  # check __SOMETIMES_UNREACHABLE_HOSTS
    ...     __check_url("https://fsf.org/111111111111111", vu)

    >>> try:
    ...     with redirect_stdout(None):
    ...         __check_url("https://github.io.github.io/111111111", vu)
    ... except ValueError as ve:
    ...     print(str(ve))
    Could not load url 'https://github.io.github.io/111111111'.

    >>> with redirect_stdout(None):
    ...     __check_url("https://thomasweise.github.io/pycommons", vu)
    ...     __check_url("http://example.com/", vu)
    ...     __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...         ".io.html", vu)
    >>> __check_url("https://thomasweise.github.io/pycommons", vu)
    >>> __check_url(
    ...     "https://thomasweise.github.io/pycommons/pycommons.io.html", vu)

    >>> __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...     ".io.html#pycommons.io.path.Path", vu)
    >>> __check_url("http://example.com", vu)

    >>> try:
    ...     __check_url("bwri435//sdfsdf:-@@", vu)
    ... except ValueError as ve:
    ...     print(str(ve)[:50])
    Error in url 'bwri435//sdfsdf:-@@': URL part 'bwri

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruS.jpg#34", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s[:61])
    Url 'https://thomasweise.github.io/sifwrwruS.jpg#34' does not

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url("ssh://u@thomasweise.github.io/sifwrwruSSXFd", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s)
    Invalid scheme for url 'ssh://u@thomasweise.github.io/sifwrwruSSXFd'.

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruSSXFdfDX", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> s.endswith("returns code 404.") or s.startswith("Could not load url")
    True

    >>> try:
    ...     __check_url(None, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __check_url(1, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __check_url("http://example.com", None)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is None.

    >>> try:
    ...     __check_url("http://example.com", 1)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is int, namely 1.

    >>> try:
    ...     __check_url("http://example.com", dict(), None)
    ... except TypeError as te:
    ...     print(te)
    http should be an instance of urllib3.poolmanager.PoolManager but is None.

    >>> try:
    ...     __check_url("http://example.com", dict(), 1)
    ... except TypeError as te:
    ...     print(str(te)[:50])
    http should be an instance of urllib3.poolmanager.
    """

    if not isinstance(valid_urls, dict):
        raise type_error(valid_urls, "valid_urls", dict)
    if not isinstance(http, PoolManager):
        raise type_error(http, "http", PoolManager)

    if urlstr in valid_urls:
        return

    try:
        url: Final[URL] = URL(urlstr)
    except ValueError as ve:
        raise ValueError(f"Error in url {urlstr!r}: {ve}") from None

    if (url in __CORRECT_URLS) or (url in valid_urls):
        return
    if url.scheme == "mailto":
        return
    if not url.scheme.startswith("http"):
        raise ValueError(f"Invalid scheme for url {url!r}.")

    needs_body: Final[bool] = __needs_body(url)

    base_url: URL = url
    fragment: Final[str | None] = url.fragment
    if fragment is not None:
        base_url = URL(url[:url.index("#")])
        if not needs_body:
            raise ValueError(
                f"Url {url!r} does not need body but has "
                f"fragment {url.fragment!r}?")
        if base_url in valid_urls:
            __find_fragment_html(valid_urls[base_url], fragment, url)
            return
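    # Example of the above: for "https://example.com/a.html#intro", base_url
    # becomes "https://example.com/a.html" and fragment becomes "intro"; if
    # the base document was fetched before, the fragment check happens
    # immediately against the cached body.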

    code: int
    body: str | None = None
    method = "GET" if needs_body else "HEAD"
    error: BaseException | None = None
    response: HTTPResponse | None = None
    headers: Final[list[dict[str, str] | None]] = list(__HEADERS)
    header_count: int = 0

# Sometimes, access to the URLs on GitHub fails.
# I think they probably throttle access from here.
# Therefore, we first do a request with a 5s timeout and 0 retries.
# If that fails, we wait 2 seconds and try with timeout 8 and 3 retries.
# If that fails, we wait for 5s, then try with timeout 30 and 3 retries.
# If that fails too, we assume that the URL is really incorrect, which
# should then only rarely be a false alarm (justifying the many retries).
    for sleep_time, retries, timeout in (
            (0, 0, 5), (2, 3, 8), (5, 3, 30)):
        if sleep_time > 0:
            sleep(sleep_time)

# We try to get a random header to deal with the problem that some pages
# will not permit certain user agents. To handle this issue, we try to not
# use any user agent twice: we randomly pick a user agent and, if it fails,
# make sure to use all other user agents first before we use that one again.
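# A worked example of this rotation (hypothetical entries): if headers ==
# [A, B, C] and header_count == 3, a random pick of index 1 selects B;
# header_count drops to 2 and B is swapped to the end, giving [A, C, B].
# The next attempt then draws only from the first two slots, so B is not
# reused until both A and C have been tried.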

        if header_count <= 0:
            header_count = len(headers)
        header_idx = randint(0, header_count - 1)  # noqa: S311
        header: dict[str, str] | None = headers[header_idx]
        header_count -= 1
        headers[header_count], headers[header_idx] \
            = header, headers[header_count]
        try:
            response = cast("HTTPResponse", http.request(
                method, base_url, timeout=timeout, redirect=True,
                retries=retries, headers=header))
            if isinstance(response, HTTPResponse) and isinstance(
                    response.status, int) and (response.status == 200):
                error = None
                break
        except BaseException as be:  # noqa
            logger(f"Attempt sleep={sleep_time}, retries={retries}, "
                   f"timeout={timeout}, error={str(be)!r}, and "
                   f"header={header!r} for {base_url!r} gave {be}.")
            error = be

    if error is not None:
        # sometimes, I cannot reach some hosts from here...
        if url.host in __SOMETIMES_UNREACHABLE_HOSTS:
            return  # we will accept this here
        raise ValueError(f"Could not load url {url!r}.") from error

    if not isinstance(response, HTTPResponse):  # should be impossible...
        raise ValueError(f"Response {response} from url={url!r}?")  # noqa

    code = check_int_range(response.status, "response.status", 0, 10000)
    if needs_body:
        try:
            body = str.strip(response.data.decode(UTF8))
        except BaseException as be:  # noqa
            raise ValueError(f"Error in body of url {url!r}: {be}") from be

    body_len: Final[int] = 0 if body is None else str.__len__(body)
    logger(f"Checked url {url!r} got code {code} for method {method!r} and "
           f"{body_len} chars.")
    if code != 200:
        raise ValueError(f"Url {url!r} returns code {code}.")

    if needs_body and ((body is None) or (body_len <= 0)):
        raise ValueError(
            f"Stripped body for {url!r} / {base_url!r} is {body!r}?")

    valid_urls[base_url] = body
    if url is not base_url:
        valid_urls[url] = body

    if fragment is not None:
        __find_fragment_html(body, fragment, url)


def check_links_in_md(file: str) -> None:
    """
    Test all the links in the given file.

    :param file: the file to check

    >>> from pycommons.io.temp import temp_file
    >>> with temp_file() as tf:
    ...     tf.write_all_str("[test](https://example.com)")
    ...     check_links_in_md(tf)

    >>> with temp_file() as tf:
    ...     try:
    ...         check_links_in_md(tf)
    ...     except ValueError as ve:
    ...         print(str(ve)[-19:])
    ' contains no text.
    """

    # First, we load the file as a single string.
    readme: Final[Path] = file_path(file)
    logger(f"Checking all links in the file {readme!r}.")

    text: str = readme.read_all_str()
    text_len: int = str.__len__(text)
    logger(f"Got {text_len} characters from file {readme!r}.")
    if text_len <= 0:
        raise ValueError(f"{readme!r} file is empty?")

    # remove all code blocks
    total_links_checked: int = 0
    start: int = -1
    lines: Final[list[str]] = []
    while True:
        start += 1
        i: int = text.find("\n```", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j: int = text.find("\n```", i + 1)
        if j < i:
            raise __ve("Multi-line code start without "
                       f"end in file {readme!r}", text, i)
        k: int = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Code end without newline in file {readme!r}",
                       text, i)
        lines.append(text[start:i].strip())
        start = k

    text = "\n".join(lines).strip()
    lines.clear()
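    # A worked example of the block removal above: for a text such as
    # "intro\n```\n[x](http://nope)\n```\noutro", the loop keeps only
    # "intro" and "outro", so links inside fenced code blocks are never
    # checked.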

    # these are all urls that have been verified
    valid_urls: Final[dict[str, str | None]] = {}

    # build the map of local reference marks
    start = -1
    while True:
        start += 1
        i = 0 if ((start == 0) and text.startswith("#")) \
            else text.find("\n#", start)
        if i < start:
            break
        j = text.find(" ", i + 1)
        if (j < i) or (text[j - 1] != "#"):
            raise __ve("Headline without space after # "
                       f"in file {readme!r}", text, i)
        k = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Headline without end in file {readme!r}", text, i)
        rid: str = text[j:k].strip().replace(" ", "-")
        for ch in ".:,()`/":
            rid = rid.replace(ch, "")
        rid = replace_str("--", "-", rid).lower()
        if (str.__len__(rid) <= 2) or ((rid[0] not in "123456789") and (
                start > 0)) or ("-" not in rid):
            raise __ve(f"Invalid id {rid!r} in file {readme!r}", text, i)
        valid_urls[f"#{rid}"] = None
        start = k
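    # For example, a headline "## 2.1. The `Path` Class" is registered above
    # as the reference mark "#21-the-path-class", so a later link
    # "[see](#21-the-path-class)" will be accepted as valid.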

    # remove all inline code
    start = -1
    while True:
        start += 1
        i = text.find("`", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j = text.find("`", i + 1)
        if j < i:
            raise __ve("Inline code start "
                       f"without end in file {readme!r}", text, i)
        lines.append(text[start:i].strip())
        start = j
    text = "\n".join(lines).strip()
    lines.clear()

    logger(f"Now checking '![...]()' style urls in file {readme!r}.")

    # now gather the links to images and remove them
    start = -1
    lines.clear()
    while True:
        start += 1
        i = text.find("![", start)
        if i < start:
            lines.append(text[start:])
            break
        j = text.find("]", i + 1)
        if j <= i:
            break
        if "\n" in text[i:j]:
            lines.append(text[start:i])
            start = i
            continue
        j += 1
        if text[j] != "(":
            raise __ve(f"Invalid image sequence in file {readme!r}", text, i)
        k = text.find(")", j + 1)
        if k <= j:
            raise __ve("No closing parenthesis for image sequence "
                       f"in file {readme!r}", text, i)

        __check_url(text[j + 1:k], valid_urls)
        total_links_checked += 1

        lines.append(text[start:i])
        start = k

    text = "\n".join(lines)
    lines.clear()

    logger(f"Now checking '[...]()' style urls in file {readme!r}.")

    # now gather the links and remove them
    start = -1
    lines.clear()
    while True:
        start += 1
        i = text.find("[", start)
        if i < start:
            lines.append(text[start:])
            break
        j = text.find("]", i + 1)
        if j <= i:
            break
        if "\n" in text[i:j]:
            lines.append(text[start:i])
            start = i
            continue
        j += 1
        if text[j] != "(":
            raise __ve(f"Invalid [...](...) link in file {readme!r}", text, i)
        k = text.find(")", j + 1)
        if k <= j:
            raise __ve("No closing parenthesis for [...](...)"
                       f" link in file {readme!r}", text, i)

        __check_url(text[j + 1:k], valid_urls)
        total_links_checked += 1

        lines.append(text[start:i])
        start = k

    text = "\n".join(lines)
    lines.clear()

    logger(f"Now checking ' href=' style urls in file {readme!r}.")

    # now gather the href links and remove them
    for quot in "'\"":
        start = -1
        lines.clear()
        while True:
            start += 1
            start_str = f" href={quot}"
            i = text.find(start_str, start)
            if i < start:
                lines.append(text[start:])
                break
            j = text.find(quot, i + len(start_str))
            if j <= i:
                break
            if "\n" in text[i:j]:
                lines.append(text[start:i])
                start = i
                continue
            __check_url(text[i + len(start_str):j], valid_urls)
            total_links_checked += 1

            lines.append(text[start:i])
            start = j

        text = "\n".join(lines)
        lines.clear()

    logger(f"Now checking ' src=' style urls in file {readme!r}.")
    # now gather the image links and remove them
    for quot in "'\"":
        start = -1
        lines.clear()
        while True:
            start += 1
            start_str = f" src={quot}"
            i = text.find(start_str, start)
            if i < start:
                lines.append(text[start:])
                break
            j = text.find(quot, i + len(start_str))
            if j <= i:
                break
            if "\n" in text[i:j]:
                lines.append(text[start:i])
                start = i
                continue
            __check_url(text[i + len(start_str):j], valid_urls)
            total_links_checked += 1

            lines.append(text[start:i])
            start = j

        text = "\n".join(lines)
        lines.clear()

    logger(f"Now checking '<...>' style urls in file {readme!r}.")
    start = -1
    lines.clear()
    while True:
        start += 1
        i = text.find("<http", start)
        if i < start:
            lines.append(text[start:])
            break
        j = text.find(">", i + 1)
        if j <= i:
            break
        if "\n" in text[i:j]:
            lines.append(text[start:i])
            start = i
            continue
        __check_url(text[i + 1:j], valid_urls)
        total_links_checked += 1

        lines.append(text[start:i])
        start = j

    if total_links_checked <= 0:
        raise ValueError(f"Found no links in file {readme!r}.")
    logger(f"Finished testing all {total_links_checked} links in "
           f"file {readme!r}.")