Coverage for texgit / formatters / python.py: 63%

159 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-22 02:50 +0000

1"""A formatter for python code.""" 

2import argparse 

3import io 

4import sys 

5import token 

6import tokenize 

7from re import MULTILINE, Pattern, sub 

8from re import compile as re_compile 

9from typing import Final, Iterable 

10 

11import strip_hints as sh # type: ignore 

12import yapf # type: ignore 

13from pycommons.io.arguments import make_argparser, make_epilog 

14from pycommons.types import type_error 

15 

16from texgit.formatters.source_tools import ( 

17 format_empty_lines, 

18 select_lines, 

19 split_labels, 

20 split_line_choices, 

21 strip_common_whitespace_prefix, 

22) 

23from texgit.version import __version__ 

24 

25 

def __no_empty_after(line: str) -> bool:
    """
    Tell whether blank lines directly after the given line are unwanted.

    This holds for lines beginning with one of the keywords `def`,
    `import`, or `from`, each followed by a single space.

    :param line: the line of code to inspect
    :return: `True` if `line` starts with `"def "`, `"import "`, or
        `"from "`, `False` otherwise

    >>> __no_empty_after("def ")
    True
    >>> __no_empty_after("import ")
    True
    >>> __no_empty_after("from ")
    True
    >>> __no_empty_after("def")
    False
    >>> __no_empty_after("import")
    False
    >>> __no_empty_after("from")
    False
    """
    # The trailing space is part of each prefix, so that identifiers such
    # as "define" or "fromage" are not mistaken for the keywords.
    prefixes: tuple[str, ...] = ("def ", "import ", "from ")
    return any(line.startswith(prefix) for prefix in prefixes)

46 

47 

def __empty_before(line: str) -> bool:
    """
    Tell whether an empty line should precede the given line.

    This holds for lines beginning with the keyword `def` or `class`,
    each followed by a single space.

    :param line: the line of code to inspect
    :return: `True` if `line` starts with `"def "` or `"class "`,
        `False` otherwise

    >>> __empty_before("def")
    False
    >>> __empty_before("def ")
    True
    >>> __empty_before("class")
    False
    >>> __empty_before("class ")
    True
    """
    # The trailing space is part of each prefix, so that only the actual
    # keywords match, not identifiers merely beginning with them.
    prefixes: tuple[str, ...] = ("def ", "class ")
    return any(line.startswith(prefix) for prefix in prefixes)

64 

65 

def __force_no_empty_after(line: str) -> bool:
    """
    Tell whether blank lines after the given line are strictly forbidden.

    This holds for lines whose first character is `@`.

    :param line: the line of code to inspect
    :return: `True` if `line` starts with `"@"`, `False` otherwise

    >>> __force_no_empty_after("@")
    True
    """
    # Slicing instead of indexing keeps the empty string safe: "" yields "".
    return line[:1] == "@"

76 

77 

#: the internal style for formatting Python code: the style created by
#: yapf's `CreatePEP8Style`, with the settings below overridden
__YAPF_STYLE = yapf.style.CreatePEP8Style()
__YAPF_STYLE.update({
    "ARITHMETIC_PRECEDENCE_INDICATION": True,
    "BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION": 2,
    "BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF": False,
    "COALESCE_BRACKETS": True,
    "COLUMN_LIMIT": 74,
    "EACH_DICT_ENTRY_ON_SEPARATE_LINE": False,
    "SPLIT_BEFORE_NAMED_ASSIGNS": False,
})

87 

88 

def __format_lines(code: str) -> str:
    r"""
    Reformat Python code with yapf, using the style of this module.

    After formatting, trailing whitespace is removed from the text and
    every run of four newline characters is replaced by three.

    :param code: the original code
    :return: the formatted code

    >>> __format_lines("\ndef a():\n return 7- 45\n\n")
    'def a():\n    return 7 - 45'
    >>> __format_lines("\n\n \nclass b:\n def bb(self): x =3/a()")
    'class b:\n    def bb(self):\n        x = 3 / a()'
    """
    # Only the first element of the value returned by FormatCode, the
    # formatted text, is used here.
    formatted: str = yapf.yapf_api.FormatCode(
        code, style_config=__YAPF_STYLE)[0]
    return formatted.rstrip().replace("\n\n\n\n", "\n\n\n")

103 

104 

#: the regex matching a comment that occupies a complete line; the match
#: includes the newline before and the newline after the comment line
__REGEX_STRIP_LINE_COMMENT: Pattern = re_compile(
    pattern=r"\n[ \t]*?#.*?\n",
    flags=MULTILINE,
)

108 

109 

def __strip_hints(
        code: str, strip_comments: bool = False) -> str:
    r"""
    Remove all type hints from the given code string.

    The hints are removed by `strip_hints`. Afterwards, the comment lines
    which that step leaves behind in place of hint-only lines are deleted.

    :param code: the code string
    :param strip_comments: if `True`, every comment occupying a complete
        line is removed; if `False`, only comment lines that newly
        appeared in place of a non-comment line are removed
    :return: the stripped code string

    >>> __format_lines(__strip_hints(
    ...     "a: int = 7\ndef b(c: int) -> List[int]:\n return [4]"))
    'a = 7\n\n\ndef b(c):\n    return [4]'
    """
    stripped: str = sh.strip_string_to_string(
        code, strip_nl=True, to_empty=True)

    # A line consisting only of a type hint comes back from the call above
    # as a line comment. These leftovers have to be removed.

    if strip_comments:
        # All whole-line comments are to be deleted anyway, so there is no
        # need to tell leftovers from genuine comments. The regular
        # expression is applied until the text stops changing, because
        # neighboring comment lines share a newline and one pass can
        # therefore miss some of them.
        while True:
            reduced: str = sub(__REGEX_STRIP_LINE_COMMENT, "\n", stripped)
            if reduced == stripped:
                return stripped
            stripped = reduced

    # Genuine comments must survive. A line is therefore dropped only if
    # it is a comment now, was not a comment in the original code at the
    # same line index, and still ends with the original line's text.
    originals: list[str] = code.splitlines()
    kept: list[str] = []
    for index, candidate in enumerate(stripped.splitlines()):
        if index < len(originals):
            before: str = originals[index].strip()
            after: str = candidate.strip()
            if after.startswith("#") and after.endswith(before) \
                    and not before.startswith("#"):
                continue
        kept.append(candidate.rstrip())
    return "\n".join(kept)

148 

149 

def __strip_docstrings_and_comments(code: str,
                                    strip_docstrings: bool = True,
                                    strip_comments: bool = True) -> str:
    r"""
    Remove all docstrings and comments from a string.

    The code is tokenized and written out again token by token. Strings
    that directly follow an indentation or a logical line end are treated
    as docstrings. Leading newlines of the result are removed.

    :param code: the code
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :return: the stripped string
    :raises ValueError: if nothing except newlines is left after stripping

    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, False)
    'a = 5# bla\n'
    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, True)
    'a = 5\n'
    >>> __strip_docstrings_and_comments('def b():\n    \"\"\"bla!\"\"\"', True)
    'def b():\n    '
    >>> __strip_docstrings_and_comments('# 1\na = 5\n# 2\nb = 6\n')
    'a = 5\nb = 6\n'
    """
    # First, we strip line comments that are hard to catch correctly with
    # the tokenization approach later. The regular expression is applied
    # until the text stops changing, since neighboring comment lines share
    # a newline and one pass may therefore miss some of them.
    if strip_comments:
        previous = None
        while previous != code:
            previous = code
            code = sub(__REGEX_STRIP_LINE_COMMENT, "\n", code)
        del previous

    # Now we strip the doc strings and remaining comments.
    # Starting with INDENT makes a string at the very beginning of the
    # code count as a docstring.
    prev_toktype: int = token.INDENT
    last_lineno: int = -1
    last_col: int = 0
    # Counts down by one per token. It is set to 1 when a docstring is
    # dropped, so that it is still >= 0 at the token right after it.
    eat_newline: int = 0
    with io.StringIO() as output:
        with io.StringIO(code) as reader:
            for toktype, ttext, (slineno, scol), (elineno, ecol), _ in \
                    tokenize.generate_tokens(reader.readline):
                eat_newline -= 1
                if slineno > last_lineno:
                    last_col = 0  # a new line begins at column zero
                if scol > last_col:
                    # re-create the gap between two tokens as spaces
                    output.write(" " * (scol - last_col))
                if (toktype == token.STRING) and \
                        (prev_toktype in {token.INDENT, token.NEWLINE}):
                    if strip_docstrings:
                        ttext = ""
                        eat_newline = 1
                elif toktype == tokenize.COMMENT:
                    if strip_comments:
                        ttext = ""
                elif toktype == tokenize.NEWLINE and eat_newline >= 0:
                    # the line end that belonged to a dropped docstring
                    ttext = ""
                    elineno += 1
                output.write(ttext)
                prev_toktype = toktype
                last_col = ecol
                last_lineno = elineno

        # The value must be fetched before `output` is closed. The leading
        # newlines are removed in a single pass.
        result: str = output.getvalue().lstrip("\n")

    if result:
        return result
    raise ValueError(f"code {code} becomes empty after docstring "
                     "and comment stripping!")

222 

223 

def format_python(code: Iterable[str],
                  strip_docstrings: bool = True,
                  strip_comments: bool = True,
                  strip_hints: bool = True) -> list[str]:
    """
    Format a python code fragment.

    The fragment is repeatedly normalized (empty lines, common indentation
    prefix), formatted, and stripped of the requested elements. The
    repetition ends as soon as a step no longer makes the code smaller,
    where the size is the pair (number of newlines, number of characters).

    :param code: the code fragment
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :param strip_hints: should we delete type hints?
    :return: the formatted code
    :raises TypeError: if an argument is of the wrong type (the error is
        created by `type_error`)
    :raises ValueError: if the code becomes empty or the final result
        contains no newline
    """
    if not isinstance(code, Iterable):
        raise type_error(code, "code", Iterable)
    for flag_name, flag in (("strip_docstrings", strip_docstrings),
                            ("strip_comments", strip_comments),
                            ("strip_hints", strip_hints)):
        if not isinstance(flag, bool):
            raise type_error(flag, flag_name, bool)

    # the size of the best version so far: (newline count, text length)
    best_size: tuple[int, int] = (sys.maxsize, sys.maxsize)
    best: list[str] = list(code)
    current: list[str] = best
    # During the first pass, every result is accepted unconditionally.
    first_pass_done: bool = False
    must_strip: bool = strip_docstrings or strip_comments

    while True:
        # step 1: normalize empty lines and the common indentation
        current = strip_common_whitespace_prefix(format_empty_lines(
            lines=current,
            empty_before=__empty_before,
            no_empty_after=__no_empty_after,
            force_no_empty_after=__force_no_empty_after,
            max_consecutive_empty_lines=2))
        if len(current) <= 0:
            raise ValueError("Code becomes empty.")

        text = "\n".join(map(str.rstrip, current))
        size: tuple[int, int] = (text.count("\n"), len(text))
        if first_pass_done and (best_size <= size):
            break  # no improvement: keep the previous best version
        best, best_size = current, size

        # step 2: format, strip, and, if stripping changed the text,
        # format again
        text = __format_lines(text)
        stripped = text
        if must_strip:
            stripped = __strip_docstrings_and_comments(
                text, strip_docstrings=strip_docstrings,
                strip_comments=strip_comments).rstrip()
        if strip_hints:
            stripped = __strip_hints(stripped,
                                     strip_comments=strip_comments)
        if stripped != text:
            text = __format_lines(stripped)
        del stripped

        text = text.rstrip()
        size = (text.count("\n"), len(text))
        if first_pass_done and (best_size <= size):
            break  # no improvement: keep the previous best version

        current = [line.rstrip() for line in text.splitlines()]
        best, best_size = current, size
        first_pass_done = True

    # NOTE(review): best_size[0] is the number of newlines, so a result
    # consisting of a single line is rejected here. Confirm that this is
    # intended.
    if (len(best) <= 0) or (best_size[0] <= 0):
        raise ValueError(f"Text cannot become {best}.")

    return best

295 

296 

def preprocess_python(code: list[str],
                      lines: list[int] | None = None,
                      labels: Iterable[str] | None = None,
                      params: set[str] | None = None) -> str:
    r"""
    Preprocess Python code.

    First, we select all lines of the code we want to keep.
    If labels are defined, then lines can be kept as ranges or as single
    lines.
    Otherwise, all lines are selected in this step.

    Then, if line numbers are provided, we select the lines based on the
    line numbers from the lines we have preserved.

    Finally, the Python formatter is applied, unless `params` contains
    `"format"`.

    :param code: the code loaded from a file
    :param lines: the lines to keep, or `None` if we keep all
    :param labels: a list of labels marking start and end of code snippets
        to include
    :param params: the arguments for the code formatter: `"format"` skips
        the formatter altogether, `"doc"` keeps docstrings, `"comments"`
        keeps comments, and `"hints"` keeps type hints; `None` enables
        formatting and all stripping
    :return: the formatted code string
    """
    selected = select_lines(code=code, labels=labels, lines=lines,
                            max_consecutive_empty_lines=2)

    # Each option name that is present in `params` switches the
    # corresponding processing step off.
    switches = () if params is None else params
    if "format" not in switches:
        selected = format_python(
            selected,
            strip_docstrings="doc" not in switches,
            strip_comments="comments" not in switches,
            strip_hints="hints" not in switches)

    # Drop all empty lines at the end ...
    while selected and not selected[-1]:
        del selected[-1]
    # ... and add exactly one again. After the loop above, the list is
    # either empty or ends with a non-empty line, so this always applies.
    selected.append("")
    return "\n".join(line.rstrip() for line in selected)

347 

348 

# Execute the formatter as script: the code is read from stdin and the
# processed code is written to stdout.
if __name__ == "__main__":
    parser: Final[argparse.ArgumentParser] = make_argparser(
        __file__, "Execute the Python Formatter.",
        make_epilog(
            "Format Python code received via stdin, write it to stdout.",
            2023, None, "Thomas Weise",
            url="https://thomasweise.github.io/texgit_py",
            email="tweise@hfuu.edu.cn, tweise@ustc.edu.cn"),
        __version__)

    # All three options are optional strings which default to "".
    for option, description in (
            ("--lines", "a comma-separated list of selected lines"),
            ("--labels", "a comma-separated list of labels"),
            ("--args", "a comma-separated list of arguments: "
                       "'format' to keep the whole format, "
                       "'doc' means keep the documentation, "
                       "'hints' means keep type hints, "
                       "'comments' means keep comments ")):
        parser.add_argument(
            option, help=description, type=str, default="", nargs="?")

    args: Final[argparse.Namespace] = parser.parse_args()
    input_lines: Final[list[str]] = sys.stdin.readlines()
    result: Final[str] = preprocess_python(
        input_lines,
        split_line_choices(args.lines),
        split_labels(args.labels),
        split_labels(args.args))
    sys.stdout.write(result)
    sys.stdout.flush()