Coverage for pycommons / dev / doc / process_md.py: 100%

50 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-11 03:04 +0000

1""" 

2Process a markdown file in order to make it useful for distribution. 

3 

4In order to let sphinx properly load and insert the `README.md` file into the 

5project's documentation, we need to process this file from the GitHub style 

6markdown to a variant suitable for the myst parser used in sphinx. While we 

7are at it, we can also turn absolute URLs from the GitHub-`README.md` file 

8that point to the documentation URL to relative URLs. 

9""" 

10 

11from re import Pattern, sub 

12from re import compile as re_compile 

13from typing import Any, Callable, Final, Iterable, Mapping, cast 

14 

15from pycommons.dev.url_replacer import make_url_replacer 

16from pycommons.types import type_error 

17 

#: Detects links of the form ``[xyz](#123-bla)``, i.e., in-document
#: references whose anchor starts with a run of digits followed by a hyphen.
#: Group ``\1`` is the bracketed link text *including* the brackets
#: (``[xyz]``) and group ``\2`` is the anchor without the numeric prefix
#: (``bla``).
__FIX_LINKS: Final[Pattern] = re_compile(r"(\[.+?])\(#\d+-(.+?)\)")

20 

21 

22def __process_markdown( 

23 source: Iterable[str], dest: Callable[[str], Any], 

24 line_processor: Callable[[str], str] = lambda s: s, 

25 discard_until: str | None = "## 1. Introduction") -> None: 

26 """ 

27 Process a markdown file in order to make it useful for distribution. 

28 

29 This process changes the GitHub-style markdown to a format that the myst 

30 parser, which is used by sphinx, can render properly. This involves 

31 several issues: 

32 

33 1. We discard the top-level heading. 

34 2. We need to move all sub-headings one step up. 

35 3. Furthermore, we can turn all absolute URLs pointing to the 

36 documentation website to local references starting with `./`. 

37 

38 :param source: the source line iterable 

39 :param dest: the destination callable receiving the output 

40 :param line_processor: an optional callable for processing lines 

41 :param discard_until: discard all strings until reaching this line. If 

42 this is `None`, all lines will be used. If this is not `None`, then 

43 this will be the first line to be forwarded to `dest`f 

44 

45 >>> lp = list() 

46 >>> lpp = make_url_replacer({"https://example.com/": "./"}, 

47 ... {"https://example.com/A": "xyz"}) 

48 >>> src = ["![image](https://example.com/1.jp)", 

49 ... "# This is `pycommons!`", 

50 ... "Table of contents", 

51 ... "## 1. Introduction", 

52 ... "blabla bla <https://example.com/A>!", 

53 ... "## 2. Some More Text", 

54 ... "We [also say](https://example.com/z/hello.txt) stuff.", 

55 ... "### 2.4. Code Example", 

56 ... "```", 

57 ... "But [not in code](https://example.com/z/hello.txt).", 

58 ... "```", 

59 ... "See also [here](#24-code-example)."] 

60 >>> __process_markdown(src, print, lpp) 

61 # 1. Introduction 

62 blabla bla <xyz>! 

63 # 2. Some More Text 

64 We [also say](./z/hello.txt) stuff. 

65 ## 2.4. Code Example 

66 ``` 

67 But [not in code](https://example.com/z/hello.txt). 

68 ``` 

69 See also [here](#24-code-example). 

70 

71 >>> try: 

72 ... __process_markdown(None, print, lambda x: x, "bla") 

73 ... except TypeError as te: 

74 ... print(te) 

75 source should be an instance of typing.Iterable but is None. 

76 

77 >>> try: 

78 ... __process_markdown(1, print, lambda x: x, "bla") 

79 ... except TypeError as te: 

80 ... print(te) 

81 source should be an instance of typing.Iterable but is int, namely 1. 

82 

83 >>> try: 

84 ... __process_markdown([None], print, lambda x: x, "bla") 

85 ... except TypeError as te: 

86 ... print(te) 

87 descriptor 'rstrip' for 'str' objects doesn't apply to a 'NoneType' object 

88 

89 >>> try: 

90 ... __process_markdown([1], print, lambda x: x, "bla") 

91 ... except TypeError as te: 

92 ... print(te) 

93 descriptor 'rstrip' for 'str' objects doesn't apply to a 'int' object 

94 

95 >>> try: 

96 ... __process_markdown([""], None, lambda x: x, "bla") 

97 ... except TypeError as te: 

98 ... print(te) 

99 dest should be a callable but is None. 

100 

101 >>> try: 

102 ... __process_markdown([""], 1, lambda x: x, "bla") 

103 ... except TypeError as te: 

104 ... print(te) 

105 dest should be a callable but is int, namely 1. 

106 

107 >>> try: 

108 ... __process_markdown([""], print, None, "bla") 

109 ... except TypeError as te: 

110 ... print(te) 

111 line_processor should be a callable but is None. 

112 

113 >>> try: 

114 ... __process_markdown([""], print, 1, "bla") 

115 ... except TypeError as te: 

116 ... print(te) 

117 line_processor should be a callable but is int, namely 1. 

118 

119 >>> try: 

120 ... __process_markdown([""], print, lambda x: x, 1) 

121 ... except TypeError as te: 

122 ... print(te) 

123 descriptor '__len__' requires a 'str' object but received a 'int' 

124 

125 >>> try: 

126 ... __process_markdown([""], print, lambda x: x, "") 

127 ... except ValueError as ve: 

128 ... print(ve) 

129 discard_until cannot be ''. 

130 

131 >>> __process_markdown([""], print, lambda x: x, None) 

132 <BLANKLINE> 

133 """ 

134 if not isinstance(source, Iterable): 

135 raise type_error(source, "source", Iterable) 

136 if not callable(dest): 

137 raise type_error(dest, "dest", call=True) 

138 if not callable(line_processor): 

139 raise type_error(line_processor, "line_processor", call=True) 

140 

141 skip: bool = False 

142 if discard_until is not None: 

143 if str.__len__(discard_until) <= 0: 

144 raise ValueError(f"discard_until cannot be {discard_until!r}.") 

145 skip = True 

146 else: 

147 discard_until = "" 

148 

149 in_code: bool = False # we only process non-code lines 

150 needs_newline: bool = False # required after image lines 

151 add_images_anyway: bool = True 

152 for the_line in source: 

153 line = str.rstrip(the_line) # enforce string 

154 

155 # we skip everything until the introduction section 

156 if skip: 

157 the_line_lstr = str.lstrip(the_line) 

158 if str.__len__(the_line_lstr) <= 0: 

159 continue 

160 if the_line_lstr.startswith(discard_until): 

161 skip = False 

162 elif the_line_lstr.startswith("[![") and add_images_anyway: 

163 needs_newline = True 

164 dest(line) 

165 continue 

166 else: 

167 add_images_anyway = False 

168 continue 

169 

170 if needs_newline: 

171 dest("") 

172 needs_newline = False 

173 

174 if in_code: 

175 if line.startswith("```"): 

176 in_code = False # toggle to non-code 

177 elif line.startswith("```"): 

178 in_code = True # toggle to code 

179 elif line.startswith("#"): 

180 line = line[1:] # move all sub-headings one step up 

181 else: # e.g., fix all urls via the line processor 

182 line = str.rstrip(line_processor(line)) 

183 

184 dest(line) 

185 

186 

def process_markdown_for_sphinx(
        source: Iterable[str], dest: Callable[[str], Any],
        base_urls: Mapping[str, str] | None = None,
        full_urls: Mapping[str, str] | None = None,
        discard_until: str | None = "## 1. Introduction") -> None:
    """
    Convert GitHub-style markdown into a form that sphinx/myst can render.

    The lines from `source` are transformed and handed, one by one, to
    `dest`. The transformation covers the following aspects:

    1. The top-level heading is dropped.
    2. All sub-headings are moved up by one level.
    3. Absolute URLs pointing to the documentation website can be turned
       into local references starting with `./`.
    4. The myst parser drops the numerical prefixes of links, i.e., it tags
       `## 1.2. Hello` with id `hello` instead of `12-hello`. Therefore,
       references following the pattern `[xxx](#12-hello)` are rewritten to
       `[xxx](#hello)`.

    :param source: the source line iterable
    :param dest: the destination callable receiving the output
    :param base_urls: a mapping of basic urls to shortcuts
    :param full_urls: a mapping of full urls to abbreviations
    :param discard_until: discard all strings until reaching this line. If
        this is `None`, all lines will be used. If this is not `None`, then
        this will be the first line to be forwarded to `dest`

    >>> lp = list()
    >>> src = ["![image](https://example.com/1.jp)",
    ...        "# This is `pycommons!`",
    ...        "Table of contents",
    ...        "## 1. Introduction",
    ...        "blabla bla <https://example.com/A>!",
    ...        "## 2. Some More Text",
    ...        "We [also say](https://example.com/z/hello.txt) stuff.",
    ...        "### 2.4. Code Example",
    ...        "```",
    ...        "But [not in code](https://example.com/z/hello.txt).",
    ...        "```",
    ...        "See also [here](#24-code-example)."]
    >>> process_markdown_for_sphinx(src, print,
    ...                             {"https://example.com/": "./"},
    ...                             {"https://example.com/A": "xyz"})
    # 1. Introduction
    blabla bla <xyz>!
    # 2. Some More Text
    We [also say](./z/hello.txt) stuff.
    ## 2.4. Code Example
    ```
    But [not in code](https://example.com/z/hello.txt).
    ```
    See also [here](#code-example).

    >>> try:
    ...     process_markdown_for_sphinx(None, print)
    ... except TypeError as te:
    ...     print(te)
    source should be an instance of typing.Iterable but is None.

    >>> try:
    ...     process_markdown_for_sphinx(1, print)
    ... except TypeError as te:
    ...     print(te)
    source should be an instance of typing.Iterable but is int, namely 1.

    >>> try:
    ...     process_markdown_for_sphinx([None], print)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'rstrip' for 'str' objects doesn't apply to a 'NoneType' object

    >>> try:
    ...     process_markdown_for_sphinx([1], print)
    ... except TypeError as te:
    ...     print(te)
    descriptor 'rstrip' for 'str' objects doesn't apply to a 'int' object

    >>> try:
    ...     process_markdown_for_sphinx([""], None)
    ... except TypeError as te:
    ...     print(te)
    dest should be a callable but is None.

    >>> try:
    ...     process_markdown_for_sphinx([""], 1)
    ... except TypeError as te:
    ...     print(te)
    dest should be a callable but is int, namely 1.

    >>> try:
    ...     process_markdown_for_sphinx([""], print, 1, None, "bla")
    ... except TypeError as te:
    ...     print(te)
    base_urls should be an instance of typing.Mapping but is int, namely 1.

    >>> try:
    ...     process_markdown_for_sphinx([""], print, None, 1, "bla")
    ... except TypeError as te:
    ...     print(te)
    full_urls should be an instance of typing.Mapping but is int, namely 1.

    >>> try:
    ...     process_markdown_for_sphinx([""], print, None, None, 1)
    ... except TypeError as te:
    ...     print(te)
    descriptor '__len__' requires a 'str' object but received a 'int'

    >>> try:
    ...     process_markdown_for_sphinx([""], print, None, None, "")
    ... except ValueError as ve:
    ...     print(ve)
    discard_until cannot be ''.

    >>> process_markdown_for_sphinx([""], print, None, None, None)
    <BLANKLINE>
    """
    # Build the URL replacer once, before any line is processed.
    replace_urls: Callable[[str], str] = make_url_replacer(
        base_urls, full_urls)

    def fix_line(text: str) -> str:
        """Strip numeric anchor prefixes, then apply the URL replacer."""
        without_prefix: str = __FIX_LINKS.sub("\\1(#\\2)", text)
        return replace_urls(without_prefix)

    __process_markdown(source, dest, fix_line, discard_until)