Coverage for bookbuilderpy/format_python.py: 93%

143 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-17 23:15 +0000

1"""A formatter for python code.""" 

2import io 

3import sys 

4import token 

5import tokenize 

6from typing import Iterable 

7 

8import regex as reg # type: ignore 

9import strip_hints as sh # type: ignore 

10import yapf # type: ignore 

11 

12from bookbuilderpy.source_tools import ( 

13 format_empty_lines, 

14 select_lines, 

15 strip_common_whitespace_prefix, 

16) 

17from bookbuilderpy.strings import lines_to_str, str_to_lines 

18from bookbuilderpy.types import type_error 

19 

20 

def __no_empty_after(line: str) -> bool:
    """
    Check whether no empty line is permitted after the given line.

    This is the case if the line begins with one of the prefixes `def `,
    `import `, or `from `. The trailing space is part of each prefix, so
    the bare keyword alone does not match.

    :param line: the line to inspect
    :return: `True` if the line starts with one of the prefixes, `False`
        otherwise

    >>> __no_empty_after("def ")
    True
    >>> __no_empty_after("import ")
    True
    >>> __no_empty_after("from ")
    True
    >>> __no_empty_after("def")
    False
    >>> __no_empty_after("import")
    False
    >>> __no_empty_after("from")
    False
    """
    prefixes = ("def ", "import ", "from ")
    return any(line.startswith(prefix) for prefix in prefixes)

41 

42 

def __empty_before(line: str) -> bool:
    """
    Check whether an empty line is needed before the given line.

    This is the case if the line begins with `def ` or `class `. The
    trailing space is part of each prefix, so the bare keyword alone does
    not match.

    :param line: the line to inspect
    :return: `True` if the line starts with `def ` or `class `, `False`
        otherwise

    >>> __empty_before("def")
    False
    >>> __empty_before("def ")
    True
    >>> __empty_before("class")
    False
    >>> __empty_before("class ")
    True
    """
    for keyword in ("def ", "class "):
        if line.startswith(keyword):
            return True
    return False

59 

60 

def __force_no_empty_after(line: str) -> bool:
    """
    Check whether an empty line is strictly forbidden after the given line.

    This is the case if the first character of the line is `@`.

    :param line: the line to inspect
    :return: `True` if the line starts with `@`, `False` otherwise

    >>> __force_no_empty_after("@")
    True
    """
    # slicing instead of indexing keeps the empty string safe
    return line[:1] == "@"

71 

72 

#: the internal style for formatting Python code: the yapf PEP8 style
#: with a few project-specific overrides
__YAPF_STYLE = yapf.style.CreatePEP8Style()
__YAPF_STYLE.update({
    "ARITHMETIC_PRECEDENCE_INDICATION": True,
    "BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION": 1,
    "BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF": False,
    "COALESCE_BRACKETS": True,
    "COLUMN_LIMIT": 74,
    "EACH_DICT_ENTRY_ON_SEPARATE_LINE": False,
    "SPLIT_BEFORE_NAMED_ASSIGNS": False,
})

82 

83 

def __format_lines(code: str) -> str:
    r"""
    Format Python code with yapf using the module's internal style.

    Trailing whitespace of the formatted text is removed.

    :param code: the original code
    :return: the formatted code, without trailing whitespace

    >>> __format_lines("\ndef a():\n    return 7-  45\n\n")
    'def a():\n    return 7 - 45'
    >>> __format_lines("\n\n   \nclass b:\n    def bb(self):   x  =3/a()")
    'class b:\n    def bb(self):\n        x = 3 / a()'
    """
    outcome = yapf.yapf_api.FormatCode(code, style_config=__YAPF_STYLE)
    # the first element of the outcome is the reformatted source text
    formatted = outcome[0]
    return formatted.rstrip()

98 

99 

#: the regex matching a comment that occupies a complete line, including
#: the newline before it and the newline after it
__REGEX_STRIP_LINE_COMMENT: reg.Regex = reg.compile(
    r"\n[ \t]*?#.*?\n",
    flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101

104 

105 

def __strip_hints(code: str,
                  strip_comments: bool = False) -> str:
    r"""
    Strip all type hints from the given code string.

    The hints are removed via `strip_hints`. According to the original
    notes in this function, lines consisting only of a type hint are
    turned into line comments by that step; these leftovers are removed
    afterwards.

    :param code: the code string
    :param strip_comments: if `True`, all comments occupying a complete
        line are removed with a regular expression; if `False`, only
        those comment lines are removed whose counterpart in the original
        code was not a comment
    :return: the stripped code string

    >>> __format_lines(__strip_hints(
    ...     "a: int = 7\ndef b(c: int) -> List[int]:\n    return [4]"))
    'a = 7\n\ndef b(c):\n    return [4]'
    """
    stripped: str = sh.strip_string_to_string(
        code, strip_nl=True, to_empty=True)

    if strip_comments:
        # All full-line comments go anyway, so a regular expression is
        # enough. One substitution pass can miss directly adjacent comment
        # lines, because they share a newline; hence we repeat until
        # nothing changes anymore.
        while True:
            reduced = reg.sub(__REGEX_STRIP_LINE_COMMENT, "\n", stripped)
            if reduced == stripped:
                return stripped
            stripped = reduced

    # Normal comments must survive: compare the old and the new text line
    # by line and drop only comment lines that were created by the hint
    # stripping.
    before: list[str] = code.splitlines()
    after: list[str] = stripped.splitlines()
    comparable: int = min(len(before), len(after))

    def __is_leftover(index: int) -> bool:
        """Check if the new line at `index` is a freshly made comment."""
        old: str = before[index].strip()
        new: str = after[index].strip()
        return new.startswith("#") and (not old.startswith("#")) \
            and new.endswith(old)

    kept: list[str] = [
        new_line for index, new_line in enumerate(after)
        if (index >= comparable) or (not __is_leftover(index))]
    return lines_to_str(kept, trailing_newline=False)

145 

146 

def __strip_docstrings_and_comments(code: str,
                                    strip_docstrings: bool = True,
                                    strip_comments: bool = True) -> str:
    r"""
    Remove docstrings and/or comments from a piece of Python code.

    The code is tokenized and written out again token by token, with the
    text of docstring tokens and comment tokens replaced by the empty
    string where requested. Leading newlines of the result are removed.

    :param code: the code
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :return: the stripped string
    :raises ValueError: if nothing is left of the code after stripping

    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, False)
    'a = 5# bla\n'
    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, True)
    'a = 5\n'
    >>> __strip_docstrings_and_comments('def b():\n    \"\"\"bla!\"\"\"', True)
    'def b():\n    '
    >>> __strip_docstrings_and_comments('# 1\na = 5\n# 2\nb = 6\n')
    'a = 5\nb = 6\n'
    """
    # Comments occupying a whole line are hard to catch correctly with
    # the tokenization below, so they are removed first with a regular
    # expression, applied until the text does not change anymore.
    if strip_comments:
        while True:
            shorter = reg.sub(__REGEX_STRIP_LINE_COMMENT, "\n", code)
            if shorter == code:
                break
            code = shorter

    # Now the docstrings and the remaining comments are stripped.
    # A string token counts as docstring if it directly follows an INDENT
    # or NEWLINE token.
    docstring_predecessors = (token.INDENT, token.NEWLINE)
    previous_type: int = token.INDENT
    previous_line: int = -1
    previous_col: int = 0
    # set to 1 when a docstring was removed, so that the NEWLINE token
    # directly following it is swallowed as well
    newline_budget: int = 0
    with io.StringIO() as sink, io.StringIO(code) as source:
        for tok in tokenize.generate_tokens(source.readline):
            kind = tok.type
            text = tok.string
            start_line, start_col = tok.start
            end_line, end_col = tok.end

            newline_budget -= 1
            if start_line > previous_line:
                previous_col = 0
            if start_col > previous_col:
                # re-create the gap between this token and the last one
                sink.write(" " * (start_col - previous_col))

            if (kind == token.STRING) \
                    and (previous_type in docstring_predecessors):
                if strip_docstrings:
                    text = ""
                    newline_budget = 1
            elif kind == tokenize.COMMENT:
                if strip_comments:
                    text = ""
            elif (kind == tokenize.NEWLINE) and (newline_budget >= 0):
                text = ""
                end_line += 1

            sink.write(text)
            previous_type = kind
            previous_col = end_col
            previous_line = end_line

        result = sink.getvalue()

    # remove leading newlines
    result = result.lstrip("\n")
    if result:
        return result

    raise ValueError(f"code {code} becomes empty after docstring "
                     "and comment stripping!")

219 

220 

def format_python(code: Iterable[str],
                  strip_docstrings: bool = True,
                  strip_comments: bool = True,
                  strip_hints: bool = True) -> list[str]:
    """
    Format a python code fragment.

    The fragment is processed in passes. Each pass normalizes the empty
    lines, removes the common whitespace prefix, formats the code, and
    strips docstrings, comments, and type hints as requested. The passes
    are repeated until the text no longer gets smaller, where size is
    measured first by the number of newlines and then by the number of
    characters.

    :param code: the code fragment
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :param strip_hints: should we delete type hints?
    :return: the formatted code
    :raises ValueError: if the code becomes empty, or if the final text
        contains no newline character
    """
    # argument validation; the error object itself is built by
    # `type_error`
    if not isinstance(code, Iterable):
        raise type_error(code, "code", Iterable)
    for flag_name, flag in (("strip_docstrings", strip_docstrings),
                            ("strip_comments", strip_comments),
                            ("strip_hints", strip_hints)):
        if not isinstance(flag, bool):
            raise type_error(flag, flag_name, bool)

    # size of the best text so far: (number of newlines, length)
    best_size: tuple[int, int] = (sys.maxsize, sys.maxsize)
    best: list[str] = list(code)
    current: list[str] = best
    first_pass: bool = True

    while True:
        current = strip_common_whitespace_prefix(format_empty_lines(
            lines=current,
            empty_before=__empty_before,
            no_empty_after=__no_empty_after,
            force_no_empty_after=__force_no_empty_after,
            max_consecutive_empty_lines=1))
        if len(current) == 0:
            raise ValueError("Code becomes empty.")

        text = lines_to_str(current)
        size: tuple[int, int] = (text.count("\n"), len(text))
        if (not first_pass) and (size >= best_size):
            break  # the line normalization brought no improvement
        best, best_size = current, size

        text = __format_lines(text)
        stripped = text
        if strip_docstrings or strip_comments:
            stripped = __strip_docstrings_and_comments(
                text, strip_docstrings=strip_docstrings,
                strip_comments=strip_comments).rstrip()
        if strip_hints:
            stripped = __strip_hints(stripped,
                                     strip_comments=strip_comments)
        if stripped != text:
            # something was removed, so the layout may have changed
            text = __format_lines(stripped)

        text = text.rstrip()
        size = (text.count("\n"), len(text))
        if (not first_pass) and (size >= best_size):
            break  # formatting and stripping brought no improvement

        current = str_to_lines(text)
        best, best_size = current, size
        first_pass = False

    if (len(best) == 0) or (best_size[0] <= 0):
        raise ValueError(f"Text cannot become {best}.")

    return best

292 

293 

def preprocess_python(code: list[str],
                      lines: list[int] | None = None,
                      labels: Iterable[str] | None = None,
                      args: set[str] | None = None) -> str:
    r"""
    Preprocess Python code.

    First, the lines to keep are chosen via `select_lines`, based on the
    given `labels` and line numbers `lines`. Then, unless switched off,
    the Python formatter `format_python` is applied to the kept lines.
    The result is joined into one string with a trailing newline.

    :param code: the code loaded from a file
    :param lines: the lines to keep, or `None` if we keep all
    :param labels: a list of labels marking start and end of code snippets
        to include
    :param args: the arguments for the code formatter; each name present
        switches one step off: `format` skips the formatter completely,
        `doc` keeps the docstrings, `comments` keeps the comments, and
        `hints` keeps the type hints
    :return: the formatted code string
    """
    keep_lines = select_lines(code=code, labels=labels, lines=lines)

    # without arguments, every step stays enabled
    switches = args if args else ()

    if "format" in switches:
        # formatting is disabled: hand back the selected lines as they are
        return lines_to_str(keep_lines, trailing_newline=True)

    formatted = format_python(keep_lines,
                              strip_docstrings="doc" not in switches,
                              strip_comments="comments" not in switches,
                              strip_hints="hints" not in switches)
    return lines_to_str(formatted, trailing_newline=True)