Coverage for pycommons / dev / tests / links_in_md.py: 92%

309 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-11 03:04 +0000

1"""Test all the links in.""" 

2from os import environ 

3from random import randint 

4from time import sleep 

5from typing import Final, cast 

6 

7# noinspection PyPackageRequirements 

8from certifi import where 

9 

10# noinspection PyPackageRequirements 

11from urllib3 import PoolManager # type: ignore 

12 

13# noinspection PyPackageRequirements 

14from urllib3.response import HTTPResponse # type: ignore 

15 

16from pycommons.io.console import logger 

17from pycommons.io.path import UTF8, Path, file_path 

18from pycommons.net.url import URL 

19from pycommons.strings.string_tools import replace_str 

20from pycommons.types import check_int_range, type_error 

21 

#: The hosts that sometimes are unreachable from my local machine.
#: When the test is executed in a GitHub workflow, all hosts should be
#: reachable, except sometimes our institute's website and fsf.org.
__SOMETIMES_UNREACHABLE_HOSTS: Final[set[str]] = \
    {"fsf.org"} if "GITHUB_JOB" in environ else \
    {"fsf.org", "img.shields.io", "pypi.org", "docs.python.org"}

#: URLs that we never need to check because they are known to be OK.
#: Note: the last entry is one URL split over two adjacent string literals.
__CORRECT_URLS: Final[set[str]] = {
    "https://example.com", "http://example.com",
    "https://github.com", "http://github.com",
    "https://www.acm.org/publications/policies/artifact-review"
    "-and-badging-current"}

35 

36 

def __ve(msg: str, text: str, idx: int) -> ValueError:
    """
    Raise a value error for the given text piece.

    :param msg: the message
    :param text: the string
    :param idx: the index
    :returns: a :class:`ValueError` ready to be raised
    :raises TypeError: if either argument is of the wrong type

    >>> try:
    ...     __ve(None, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve(1, " ", 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", None, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __ve("bla", 1, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __ve("bla", "txt", None)
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is None.

    >>> try:
    ...     __ve("bla", "txt", "x")
    ... except TypeError as te:
    ...     print(te)
    idx should be an instance of int but is str, namely 'x'.

    >>> print(repr(__ve("", "txt", 1)))
    ValueError('Empty message!')

    >>> print(repr(__ve("msg", "", 1)))
    ValueError("Empty text '' for message 'msg'.")

    >>> print(repr(__ve("msg", "txt", 5)))
    ValueError("Index 5 is outside of text of length 3 for message 'msg'.")

    >>> print(repr(__ve("msg", "long text", 2)))
    ValueError("msg: '...long text...'")
    """
    # Using str.__len__ directly makes non-str arguments fail with a
    # descriptor TypeError, which doubles as the type check.
    if str.__len__(msg) <= 0:
        return ValueError("Empty message!")
    n: Final[int] = str.__len__(text)
    if n == 0:
        return ValueError(f"Empty text {text!r} for message {msg!r}.")
    if not isinstance(idx, int):
        raise type_error(idx, "idx", int)
    if idx >= n:
        return ValueError(f"Index {idx} is outside of text of length"
                          f" {n} for message {msg!r}.")
    # Show a window of up to 32 chars before and 64 chars after idx.
    excerpt: Final[str] = str.strip(text[max(idx - 32, 0):min(idx + 64, n)])
    return ValueError(f"{msg}: '...{excerpt}...'")

107 

108 

def __make_headers() -> tuple[dict[str, str] | None, ...]:
    """
    Make the headers.

    The first element is `None`, i.e., "use no special headers at all".
    Every other element sets exactly one `User-Agent` value.

    :returns: the headers
    """
    user_agents: Final[tuple[str, ...]] = (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:138.0) Gecko/20100101"
        " Firefox/138.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
        "KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like"
        " Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101"
        " Firefox/106.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
        "Gecko) Chrome/109.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0."
        "1518.55",
        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 "
        "Version/12.16.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) "
        "like Gecko",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/"
        "537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
        "Mozilla/5.0 (PLAYSTATION 3; 3.55)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ("
        "KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/114.0.1823"
        ".901",
        "mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 ("
        "khtml, like gecko) chrome/80.0.3987.87 safari/537.36 edg/80.0."
        "361.502",
        "Mozilla/5.0 (X11; Linux i686; rv:13.0) Gecko/13.0 Firefox/13.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
        "Gecko) Ubuntu Chromium/80.0.3987.149 HeadlessChrome/80.0.3987."
        "149 Safari/537.36")
    return (None, *({"User-Agent": agent} for agent in user_agents))

149 

150 

#: The headers to use for the HTTP requests.
#: It seems that some websites may throttle requests.
#: Maybe by using different headers, we can escape this.
__HEADERS: Final[tuple[dict[str, str] | None, ...]] = __make_headers()
# The factory function is needed only once, so drop it from the namespace.
del __make_headers

156 

157 

def __needs_body(url: URL) -> bool:
    """
    Check whether we need the body of the given url.

    If the complete body of the document needs to be downloaded, this function
    returns `True`. This is the case, for example, if we are talking about
    html documents. In this case, we need to (later) scan for internal
    references, i.e., for stuff like `id="..."` attributes. However, if the
    url does not point to an HTML document, maybe a PDF, then we do not need
    the whole body and return `False`. In the latter case, it is sufficient to
    do a `HEAD` HTTP request, in the former case we need a full `GET`.

    :param url: the url string
    :returns: `True` if the body is needed, `False` otherwise
    :raises TypeError: if `url` is not an instance of :class:`URL`

    >>> __needs_body(URL("http://www.github.com/"))
    True
    >>> __needs_body(URL("http://www.github.com"))
    True
    >>> __needs_body(URL("http://www.github.com/1.htm"))
    True
    >>> __needs_body(URL("http://www.github.com/1.html"))
    True
    >>> __needs_body(URL("http://www.github.com/1.jpg"))
    False
    >>> __needs_body(URL("http://www.github.com/1"))
    True

    >>> try:
    ...     __needs_body(None)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is N

    >>> try:
    ...     __needs_body(1)
    ... except TypeError as te:
    ...     print(str(te)[:59])
    url should be an instance of pycommons.net.url.URL but is i
    """
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    path: Final[str | None] = url.path
    if path is None:  # no path at all: treat like a directory / HTML page
        return True
    # Extension-less paths, directories, and explicit HTML files need a GET.
    return ("." not in path) or path.endswith((".html", ".htm", "/"))

203 

204 

def __find_fragment_html(body: str, fragment: str, url: URL) -> None:
    r"""
    Check whether the fragment is contained in the body as ID.

    :param body: the body that was loaded
    :param fragment: the fragment
    :param url: the url from which the body was loaded
    :raises TypeError: if `body`, `fragment`, or `url` are not all strings
    :raises ValueError: if `body` does not contain `fragment` as an ID
        somewhere

    >>> __find_fragment_html("<p id='1'>bla</p>", "1",
    ...     URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=\"1\">bla</p>", "1",
    ...     URL("http://example.com#1"))
    >>> __find_fragment_html("<p id=1>bla</p>", "1",
    ...     URL("http://example.com#1"))

    >>> try:
    ...     __find_fragment_html(None, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html(1, "1", URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", None,
    ...         URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", 1,
    ...         URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", None,
    ...         URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", 1,
    ...         URL("http://example.com#1"))
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", None)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is None.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1", 1)
    ... except TypeError as te:
    ...     print(te)
    url should be an instance of pycommons.net.url.URL but is int, namely 1.

    >>> try:
    ...     __find_fragment_html("", "1",
    ...         URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty body: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "",
    ...         URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Empty fragment: ''.

    >>> try:
    ...     __find_fragment_html("<p id='1'>bla</p>", "1",
    ...         URL("http://example.com"))
    ... except ValueError as ve:
    ...     print(ve)
    Url 'http://example.com' does not end in fragment '1'.

    >>> try:
    ...     __find_fragment_html("<p id='x1'>bla</p>", "1",
    ...         URL("http://example.com#1"))
    ... except ValueError as ve:
    ...     print(str(ve)[:-4])
    Did not find id='1' of 'http://example.com#1' in body "<p id='x1'>bla</
    """
    # str.__len__ doubles as a type check: non-str raises TypeError.
    if str.__len__(body) <= 0:
        raise ValueError(f"Empty body: {body!r}.")
    if str.__len__(fragment) <= 0:
        raise ValueError(f"Empty fragment: {fragment!r}.")
    if not isinstance(url, URL):
        raise type_error(url, "url", URL)
    # Sanity check: the url must actually carry this fragment at its end.
    if not url.endswith(fragment):
        raise ValueError(
            f"Url {url!r} does not end in fragment {fragment!r}.")

    # The id attribute may appear unquoted, single-quoted, or double-quoted.
    if any(f"id={quote}{fragment}{quote}" in body
           for quote in ("", "'", '"')):
        return

    raise ValueError(
        f"Did not find id={fragment!r} of {url!r} in body {body!r}.")

319 

320 

def __check_url(urlstr: str, valid_urls: dict[str, str | None],
                http: PoolManager = PoolManager(
                    cert_reqs="CERT_REQUIRED", ca_certs=where())) -> None:
    r"""
    Check whether a URL is valid and can be reached.

    The default `http` pool manager is created once at definition time and
    deliberately shared across all calls so that connections can be reused.

    :param urlstr: the URL to be checked
    :param valid_urls: the set of valid urls; used as a cache and, for
        HTML pages, mapping the url to the downloaded body text
    :param http: the pool manager
    :raises TypeError: if any of the parameters is of the wrong type
    :raises ValueError: if the url `urlstr` cannot be loaded or if it has a
        fragment part that is not discovered in the body of the loaded
        document.

    >>> vu = dict()
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("mailto:tweise@hfuu.edu.cn", vu)
    >>> __check_url("tweise@hfuu.edu.cn", vu)

    >>> from contextlib import redirect_stdout

    >>> with redirect_stdout(None):
    ...     __check_url("https://thomasweise.github.io/pycommons", vu)
    ...     __check_url("http://example.com/", vu)
    ...     __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...         ".io.html", vu)
    >>> __check_url("https://thomasweise.github.io/pycommons", vu)
    >>> __check_url(
    ...     "https://thomasweise.github.io/pycommons/pycommons.io.html", vu)

    >>> __check_url("https://thomasweise.github.io/pycommons/pycommons"
    ...     ".io.html#pycommons.io.path.Path", vu)
    >>> __check_url("http://example.com", vu)

    >>> try:
    ...     __check_url("bwri435//sdfsdf:-@@", vu)
    ... except ValueError as ve:
    ...     print(str(ve)[:50])
    Error in url 'bwri435//sdfsdf:-@@': URL part 'bwri

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruS.jpg#34", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s[:61])
    Url 'https://thomasweise.github.io/sifwrwruS.jpg#34' does not

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url("ssh://u@thomasweise.github.io/sifwrwruSSXFd", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> print(s)
    Invalid scheme for url 'ssh://u@thomasweise.github.io/sifwrwruSSXFd'.

    >>> with redirect_stdout(None):
    ...     try:
    ...         __check_url(
    ...             "https://thomasweise.github.io/sifwrwruSSXFdfDX", vu)
    ...     except ValueError as ve:
    ...         s = str(ve)
    >>> s.endswith("returns code 404.") or s.startswith("Could not load url")
    True

    >>> try:
    ...     __check_url(None, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'NoneType'

    >>> try:
    ...     __check_url(1, dict())
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     __check_url("http://example.com", None)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is None.

    >>> try:
    ...     __check_url("http://example.com", 1)
    ... except TypeError as te:
    ...     print(te)
    valid_urls should be an instance of dict but is int, namely 1.

    >>> try:
    ...     __check_url("http://example.com", dict(), None)
    ... except TypeError as te:
    ...     print(te)
    http should be an instance of urllib3.poolmanager.PoolManager but is None.

    >>> try:
    ...     __check_url("http://example.com", dict(), 1)
    ... except TypeError as te:
    ...     print(str(te)[:50])
    http should be an instance of urllib3.poolmanager.
    """
    if not isinstance(valid_urls, dict):
        raise type_error(valid_urls, "valid_urls", dict)
    if not isinstance(http, PoolManager):
        raise type_error(http, "http", PoolManager)

    # Fast path: the raw string was already validated before.
    if urlstr in valid_urls:
        return

    try:
        url: Final[URL] = URL(urlstr)
    except ValueError as ve:
        raise ValueError(f"Error in url {urlstr!r}: {ve}") from None

    # Check again with the canonicalized URL object.
    if (url in __CORRECT_URLS) or (url in valid_urls):
        return
    if url.scheme == "mailto":
        return  # e-mail addresses cannot be checked via HTTP
    if not url.scheme.startswith("http"):
        raise ValueError(f"Invalid scheme for url {url!r}.")

    needs_body: Final[bool] = __needs_body(url)

    # Split off the "#fragment" part, if any: we request the base url and
    # later search the fragment id inside the downloaded body.
    base_url: URL = url
    fragment: Final[str | None] = url.fragment
    if fragment is not None:
        base_url = URL(url[:url.index("#")])
        if not needs_body:
            raise ValueError(
                f"Url {url!r} does not need body but has "
                f"fragment {url.fragment!r}?")
        if base_url in valid_urls:
            # The body is cached: only verify the fragment.
            __find_fragment_html(valid_urls[base_url], fragment, url)
            return

    code: int
    body: str | None = None
    # HEAD suffices if we never need the document's contents.
    method = "GET" if needs_body else "HEAD"
    error: BaseException | None = None
    response: HTTPResponse | None = None
    headers: Final[list[dict[str, str] | None]] = list(__HEADERS)
    header_count: int = 0

# Sometimes, access to the URLs on GitHub fails.
# I think they probably throttle access from here.
# Therefore, we first do a request with 5s timeout and 0 retries.
# If that fails, we wait 2 seconds and try with timeout 8 and 3 retries.
# If that fails, we wait for 5s, then try with timeout 30 and 3 retries.
# If that fails too, we assume that the URL is really incorrect, which rarely
# should not be the case (justifying the many retries).
    for sleep_time, retries, timeout in (
            (0, 0, 5), (2, 3, 8), (5, 3, 30)):
        if sleep_time > 0:
            sleep(sleep_time)

# We try to get a random header to deal with the problem that some pages
# will not permit certain user agents. To handle this issue, we try to not
# use any user agent twice. We randomly pick a user agent and, if it fails,
# make sure to use all other user agents first before we use that one again.
        if header_count <= 0:
            header_count = len(headers)
        header_idx = randint(0, header_count - 1)  # noqa: S311
        header: dict[str, str] | None = headers[header_idx]
        header_count -= 1
        # Swap the chosen header into the "already used" tail of the list.
        headers[header_count], headers[header_idx] \
            = header, headers[header_count]
        try:
            response = cast("HTTPResponse", http.request(
                method, base_url, timeout=timeout, redirect=True,
                retries=retries, headers=header))
            if isinstance(response, HTTPResponse) and isinstance(
                    response.status, int) and (response.status == 200):
                error = None  # success: forget any earlier failure
                break
        except BaseException as be:  # noqa
            logger(f"Attempt sleep={sleep_time}, retries={retries}, "
                   f"timeout={timeout}, error={str(be)!r}, and "
                   f"header={header!r} for {base_url!r} gave {be}.")
            error = be  # remember failure, try the next configuration

    if error is not None:
        # sometimes, I cannot reach some hosts from here...
        if url.host in __SOMETIMES_UNREACHABLE_HOSTS:
            return  # we will accept this here
        raise ValueError(f"Could not load url {url!r}.") from error

    if not isinstance(response, HTTPResponse):  # should be impossible...
        raise ValueError(f"Response {response} from url={url!r}?")  # noqa

    code = check_int_range(response.status, "response.status", 0, 10000)
    if needs_body:
        try:
            body = str.strip(response.data.decode(UTF8))
        except BaseException as be:  # noqa
            raise ValueError(f"Error in body of url {url!r}: {be}") from be

    body_len: Final[int] = 0 if body is None else str.__len__(body)
    logger(f"Checked url {url!r} got code {code} for method {method!r} and "
           f"{body_len} chars.")
    if code != 200:
        raise ValueError(f"Url {url!r} returns code {code}.")

    if needs_body and ((body is None) or (body_len <= 0)):
        raise ValueError(
            f"Stripped body for {url!r} / {base_url!r} is {body!r}?")

    # Cache the result for both the base url and the full url.
    valid_urls[base_url] = body
    if url is not base_url:
        valid_urls[url] = body

    if fragment is not None:
        __find_fragment_html(body, fragment, url)

534 

535 

def check_links_in_md(file: str) -> None:
    """
    Test all the links in the given file.

    The file is loaded as a single string; code blocks and inline code are
    stripped; headline anchors are collected as valid local link targets;
    then all `![...](...)`, `[...](...)`, `href=`, `src=`, and `<http...>`
    style links are extracted and verified.

    :param file: the file to check
    :raises ValueError: if the file is empty, contains malformed Markdown,
        contains no links at all, or contains a link that cannot be
        resolved
    """
    # First, we load the file as a single string
    readme: Final[Path] = file_path(file)
    logger(f"Checking all links in the file {readme!r}.")

    text: str = readme.read_all_str()
    text_len: int = str.__len__(text)
    logger(f"Got {text_len} characters from file {readme!r}.")
    if text_len <= 0:
        raise ValueError(f"{readme!r} file is empty?")

    # remove all code blocks
    total_links_checked: int = 0
    start: int = -1
    lines: Final[list[str]] = []
    while True:
        start += 1
        i: int = text.find("\n```", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j: int = text.find("\n```", i + 1)
        if j < i:
            raise __ve("Multi-line code start without "
                       f"end in file {readme!r}", text, i)
        k: int = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Code end without newline in file {readme!r}",
                       text, i)
        lines.append(text[start:i].strip())
        start = k

    text = "\n".join(lines).strip()
    lines.clear()

    # these are all urls that have been verified
    valid_urls: Final[dict[str, str | None]] = {}

    # build the map of local reference marks from the headlines:
    # a headline "## A b.c" becomes the local anchor "#a-bc"
    start = -1
    while True:
        start += 1
        i = 0 if ((start == 0) and text.startswith("#")) \
            else text.find("\n#", start)
        if i < start:
            break
        j = text.find(" ", i + 1)
        if (j < i) or (text[j - 1] != "#"):
            raise __ve("Headline without space after # "
                       f"in file {readme!r}", text, i)
        k = text.find("\n", j + 1)
        if k < j:
            raise __ve(f"Headline without end in file {readme!r}", text, i)
        rid: str = text[j:k].strip().replace(" ", "-")
        for ch in ".:,()`/":
            rid = rid.replace(ch, "")
        rid = replace_str("--", "-", rid).lower()
        if (str.__len__(rid) <= 2) or ((rid[0] not in "123456789") and (
                start > 0)) or ("-" not in rid):
            raise __ve(f"Invalid id {rid!r} in file {readme!r}", text, i)
        valid_urls[f"#{rid}"] = None
        start = k

    # remove all inline code
    start = -1
    while True:
        start += 1
        i = text.find("`", start)
        if i < start:
            lines.append(text[start:].strip())
            break
        j = text.find("`", i + 1)
        if j < i:
            raise __ve("Multi-line code start "
                       f"without end in file {readme!r}", text, i)
        lines.append(text[start:i].strip())
        start = j
    text = "\n".join(lines).strip()
    lines.clear()

    logger(f"Now checking '![...]()' style urls in file {readme!r}.")

    # now gather the links to images and remove them
    start = -1
    lines.clear()
    while True:
        start += 1
        i = text.find("![", start)
        if i < start:
            lines.append(text[start:])
            break
        j = text.find("]", i + 1)
        if j <= i:
            break
        if "\n" in text[i:j]:
            # "![" and "]" are on different lines, so this is not an image
            # link: keep the scanned prefix and continue searching after the
            # "![" (mirrors the handling in the "[...]()" loop below; the
            # original fell through here and raised a spurious error).
            lines.append(text[start:i])
            start = i
            continue
        j += 1
        if text[j] != "(":
            raise __ve(f"Invalid image sequence in file {readme!r}", text, i)
        k = text.find(")", j + 1)
        if k <= j:
            raise __ve("No closing gap for image sequence "
                       f"in file {readme!r}", text, i)

        __check_url(text[j + 1:k], valid_urls)
        total_links_checked += 1

        lines.append(text[start:i])
        start = k

    text = "\n".join(lines)
    lines.clear()

    logger(f"Now checking '[...]()' style urls in file {readme!r}.")

    # now gather the links and remove them
    start = -1
    lines.clear()
    while True:
        start += 1
        i = text.find("[", start)
        if i < start:
            lines.append(text[start:])
            break
        j = text.find("]", i + 1)
        if j <= i:
            break
        if "\n" in text[i:j]:
            lines.append(text[start:i])
            start = i
            continue
        j += 1
        if text[j] != "(":
            raise __ve(f"Invalid [...](...) link in file {readme!r}", text, i)
        k = text.find(")", j + 1)
        if k <= j:
            raise __ve("No closing gap for [...](...)"
                       f" link in file {readme!r}", text, i)

        __check_url(text[j + 1:k], valid_urls)
        total_links_checked += 1

        lines.append(text[start:i])
        start = k

    text = "\n".join(lines)
    lines.clear()

    logger(f"Now checking ' href=' style urls in file {readme!r}.")

    # now gather the href links and remove them
    for quot in "'\"":
        start = -1
        lines.clear()
        while True:
            start += 1
            start_str = f" href={quot}"
            i = text.find(start_str, start)
            if i < start:
                lines.append(text[start:])
                break
            j = text.find(quot, i + len(start_str))
            if j <= i:
                break
            if "\n" in text[i:j]:
                lines.append(text[start:i])
                start = i
                continue
            __check_url(text[i + len(start_str):j], valid_urls)
            total_links_checked += 1

            lines.append(text[start:i])
            start = j

        text = "\n".join(lines)
        lines.clear()

    logger(f"Now checking ' src=' style urls in file {readme!r}.")
    # now gather the image links and remove them
    for quot in "'\"":
        start = -1
        lines.clear()
        while True:
            start += 1
            start_str = f" src={quot}"
            i = text.find(start_str, start)
            if i < start:
                lines.append(text[start:])
                break
            j = text.find(quot, i + len(start_str))
            if j <= i:
                break
            if "\n" in text[i:j]:
                lines.append(text[start:i])
                start = i
                continue
            __check_url(text[i + len(start_str):j], valid_urls)
            total_links_checked += 1

            lines.append(text[start:i])
            start = j

        text = "\n".join(lines)
        lines.clear()

    logger(f"Now checking '<...>' style urls in file {readme!r}.")
    start = -1
    lines.clear()
    while True:
        start += 1
        i = text.find("<http", start)
        if i < start:
            lines.append(text[start:])
            break
        j = text.find(">", i + 1)
        if j <= i:
            break
        if "\n" in text[i:j]:
            lines.append(text[start:i])
            start = i
            continue
        __check_url(text[i + 1:j], valid_urls)
        total_links_checked += 1

        lines.append(text[start:i])
        start = j

    # A Markdown file without a single link is almost certainly an error.
    if total_links_checked <= 0:
        raise ValueError(f"Found no links in file {readme!r}.")
    logger(f"Finished testing all links {total_links_checked} in "
           f"file {readme!r}.")