Coverage for texgit / formatters / python.py: 63%
159 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-22 02:50 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-22 02:50 +0000
1"""A formatter for python code."""
2import argparse
3import io
4import sys
5import token
6import tokenize
7from re import MULTILINE, Pattern, sub
8from re import compile as re_compile
9from typing import Final, Iterable
11import strip_hints as sh # type: ignore
12import yapf # type: ignore
13from pycommons.io.arguments import make_argparser, make_epilog
14from pycommons.types import type_error
16from texgit.formatters.source_tools import (
17 format_empty_lines,
18 select_lines,
19 split_labels,
20 split_line_choices,
21 strip_common_whitespace_prefix,
22)
23from texgit.version import __version__
def __no_empty_after(line: str) -> bool:
    """
    Check whether no empty line is permitted directly after a line.

    This is the case for lines that begin with `def `, `import `, or
    `from `. The trailing space is part of each prefix, so the bare
    keywords alone do not match.

    :param line: the line
    :return: `True` if `line` starts with one of the prefixes, `False`
        otherwise

    >>> __no_empty_after("def ")
    True
    >>> __no_empty_after("import ")
    True
    >>> __no_empty_after("from ")
    True
    >>> __no_empty_after("def")
    False
    >>> __no_empty_after("import")
    False
    >>> __no_empty_after("from")
    False
    """
    prefixes: tuple[str, ...] = ("def ", "import ", "from ")
    return line.startswith(prefixes)
def __empty_before(line: str) -> bool:
    """
    Check whether an empty line is needed before the given line.

    This is the case for lines that begin with `def ` or `class `. The
    trailing space is part of each prefix, so the bare keywords alone do
    not match.

    :param line: the line
    :return: `True` if `line` starts with one of the prefixes, `False`
        otherwise

    >>> __empty_before("def")
    False
    >>> __empty_before("def ")
    True
    >>> __empty_before("class")
    False
    >>> __empty_before("class ")
    True
    """
    prefixes: tuple[str, ...] = ("def ", "class ")
    return line.startswith(prefixes)
def __force_no_empty_after(line: str) -> bool:
    """
    Check whether really no empty line is permitted after a line.

    This is the case for lines whose first character is `@`.

    :param line: the line
    :return: `True` if `line` starts with `@`, `False` otherwise

    >>> __force_no_empty_after("@")
    True
    """
    # Slicing instead of indexing keeps the empty string safe.
    return line[:1] == "@"
#: the internal style for formatting Python code
__YAPF_STYLE = yapf.style.CreatePEP8Style()
# Start from yapf's PEP8 preset and override the settings listed below.
__YAPF_STYLE.update({
    "ARITHMETIC_PRECEDENCE_INDICATION": True,
    "BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION": 2,
    "BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF": False,
    "COALESCE_BRACKETS": True,
    "COLUMN_LIMIT": 74,
    "EACH_DICT_ENTRY_ON_SEPARATE_LINE": False,
    "SPLIT_BEFORE_NAMED_ASSIGNS": False,
})
def __format_lines(code: str) -> str:
    r"""
    Format Python code lines.

    The code is passed through yapf using the module-level
    `__YAPF_STYLE`. Trailing whitespace is then removed from the result
    and every run of four or more consecutive newline characters is
    collapsed to exactly three, i.e., to two empty lines in a row.

    :param code: the original code
    :return: the formatted lines.

    >>> __format_lines("\ndef a():\n return 7- 45\n\n")
    'def a():\n    return 7 - 45'
    >>> __format_lines("\n\n \nclass b:\n def bb(self): x =3/a()")
    'class b:\n    def bb(self):\n        x = 3 / a()'
    """
    # `FormatCode` returns a tuple whose first element is the text.
    formatted: str = yapf.yapf_api.FormatCode(
        code, style_config=__YAPF_STYLE)[0]
    # A single non-overlapping `str.replace` pass would shorten a run of
    # five or more newlines by only one newline per group of four, which
    # leaves it longer than three. Matching the whole run avoids this.
    return sub(r"\n{4,}", "\n\n\n", formatted.rstrip())
#: the regexes stripping comments that occupy a complete line
# A match spans from the newline before the comment line up to and
# including the newline that ends it. Two directly adjacent comment
# lines therefore share a newline and cannot both match in one pass.
__REGEX_STRIP_LINE_COMMENT: Pattern = re_compile(
    r"\n[ \t]*?#.*?\n", MULTILINE)
def __strip_hints(
        code: str, strip_comments: bool = False) -> str:
    r"""
    Strip all type hints from the given code string.

    :param code: the code string
    :param strip_comments: if `True`, every comment that occupies a
        complete line is removed from the result; if `False`, only those
        comment lines that were created by the hint stripping itself are
        removed
    :return: the stripped code string

    >>> __format_lines(__strip_hints(
    ...     "a: int = 7\ndef b(c: int) -> List[int]:\n    return [4]"))
    'a = 7\n\n\ndef b(c):\n    return [4]'
    """
    stripped: str = sh.strip_string_to_string(
        code, strip_nl=True, to_empty=True)

    # Lines that held nothing but a type hint have been turned into line
    # comments by the call above. These must be removed again.

    if strip_comments:
        # All comments are to be removed anyway, so nothing complex is
        # needed: the regular expression is applied until the text no
        # longer changes.
        while True:
            reduced: str = sub(__REGEX_STRIP_LINE_COMMENT, "\n", stripped)
            if reduced == stripped:
                return stripped
            stripped = reduced

    # Normal comments must survive, so the newly created comment lines
    # are located by comparing the old and new text line by line. The
    # scan runs backwards so deletions do not shift pending indices.
    before: list[str] = code.splitlines()
    after: list[str] = stripped.splitlines()
    for idx in reversed(range(min(len(before), len(after)))):
        old_line: str = before[idx].strip()
        new_line: str = after[idx].strip()
        became_comment: bool = new_line.startswith("#") \
            and not old_line.startswith("#")
        if became_comment and new_line.endswith(old_line):
            del after[idx]
    return "\n".join(line.rstrip() for line in after)
def __strip_docstrings_and_comments(code: str,
                                    strip_docstrings: bool = True,
                                    strip_comments: bool = True) -> str:
    r"""
    Remove all docstrings and comments from a string.

    :param code: the code
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :return: the stripped string, without any leading newline characters
    :raises ValueError: if nothing but newline characters remains after
        the stripping

    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, False)
    'a = 5# bla\n'
    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, True)
    'a = 5\n'
    >>> __strip_docstrings_and_comments('def b():\n    \"\"\"bla!\"\"\"', True)
    'def b():\n    '
    >>> __strip_docstrings_and_comments('# 1\na = 5\n# 2\nb = 6\n')
    'a = 5\nb = 6\n'
    """
    # First, we strip line comments that are hard to catch correctly with
    # the tokenization approach later. The substitution is repeated until
    # the text stops changing.
    if strip_comments:
        previous = None
        while previous != code:
            previous = code
            code = sub(__REGEX_STRIP_LINE_COMMENT, "\n", code)
        del previous

    # Now we strip the doc strings and remaining comments.
    prev_toktype: int = token.INDENT
    last_lineno: int = -1
    last_col: int = 0
    eat_newline: int = 0
    with io.StringIO() as output, io.StringIO(code) as reader:
        for toktype, tok_text, (slineno, scol), (tok_elineno, ecol), _ in \
                tokenize.generate_tokens(reader.readline):
            elineno = tok_elineno
            ttext = tok_text
            # Decremented for every token: only the NEWLINE directly
            # following a removed docstring still sees a value >= 0.
            eat_newline -= 1
            if slineno > last_lineno:
                last_col = 0
            # Re-create the horizontal gap between consecutive tokens.
            if scol > last_col:
                output.write(" " * (scol - last_col))
            if (toktype == token.STRING) and \
                    (prev_toktype in {token.INDENT, token.NEWLINE}):
                # A string at the start of a statement is treated as a
                # docstring.
                if strip_docstrings:
                    ttext = ""
                    eat_newline = 1
            elif toktype == tokenize.COMMENT:
                if strip_comments:
                    ttext = ""
            elif toktype == tokenize.NEWLINE and eat_newline >= 0:
                ttext = ""
                elineno += 1
            output.write(ttext)
            prev_toktype = toktype
            last_col = ecol
            last_lineno = elineno
        # Must be read before the `with` block closes the buffer.
        result = output.getvalue()

    # remove leading newlines in a single pass
    result = result.lstrip("\n")
    if not result:
        raise ValueError(f"code {code} becomes empty after docstring "
                         "and comment stripping!")
    return result
def format_python(code: Iterable[str],
                  strip_docstrings: bool = True,
                  strip_comments: bool = True,
                  strip_hints: bool = True) -> list[str]:
    """
    Format a python code fragment.

    The fragment is repeatedly normalized (empty lines, common
    indentation), formatted, and stripped of the selected elements. The
    loop ends as soon as a round no longer makes the text shorter, where
    "shorter" compares the number of newlines first and the number of
    characters second.

    :param code: the code fragment
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :param strip_hints: should we delete type hints?
    :return: the formatted code
    :raises ValueError: if the code becomes empty or if the final text
        contains no newline character
    """
    if not isinstance(code, Iterable):
        raise type_error(code, "code", Iterable)
    # The three switches are validated in the order of the signature.
    for flag, flag_name in ((strip_docstrings, "strip_docstrings"),
                            (strip_comments, "strip_comments"),
                            (strip_hints, "strip_hints")):
        if not isinstance(flag, bool):
            raise type_error(flag, flag_name, bool)

    best_size: tuple[int, int] = (sys.maxsize, sys.maxsize)

    shortest: list[str] = list(code)
    current: list[str] = shortest
    had_full_round: bool = False
    while True:
        current = strip_common_whitespace_prefix(format_empty_lines(
            lines=current,
            empty_before=__empty_before,
            no_empty_after=__no_empty_after,
            force_no_empty_after=__force_no_empty_after,
            max_consecutive_empty_lines=2))
        if len(current) <= 0:
            raise ValueError("Code becomes empty.")

        text = "\n".join(line.rstrip() for line in current)
        size: tuple[int, int] = (text.count("\n"), len(text))
        # Stop once the normalization brings no further reduction.
        if had_full_round and (size >= best_size):
            break
        shortest = current
        best_size = size

        text = __format_lines(text)
        stripped = text
        if strip_docstrings or strip_comments:
            stripped = __strip_docstrings_and_comments(
                text, strip_docstrings=strip_docstrings,
                strip_comments=strip_comments).rstrip()
        if strip_hints:
            stripped = __strip_hints(stripped,
                                     strip_comments=strip_comments)
        # Only re-format if the stripping changed anything.
        if stripped != text:
            text = __format_lines(stripped)
        del stripped

        text = text.rstrip()
        size = text.count("\n"), len(text)
        # Stop once formatting and stripping bring no further reduction.
        if had_full_round and (size >= best_size):
            break

        current = [line.rstrip() for line in text.splitlines()]
        shortest = current
        best_size = size
        had_full_round = True

    # NOTE(review): a result consisting of a single line has zero
    # newlines and is rejected here as well - confirm this is intended.
    if (len(shortest) <= 0) or (best_size[0] <= 0):
        raise ValueError(f"Text cannot become {shortest}.")

    return shortest
def preprocess_python(code: list[str],
                      lines: list[int] | None = None,
                      labels: Iterable[str] | None = None,
                      params: set[str] | None = None) -> str:
    r"""
    Preprocess Python code.

    First, we select all lines of the code we want to keep.
    If labels are defined, then lines can be kept as ranges or as single
    lines.
    Otherwise, all lines are selected in this step.

    Then, if line numbers are provided, we select the lines based on the
    line numbers from the lines we have preserved.

    Finally, the Python formatter is applied, unless `params` contains
    `"format"`. The entries `"doc"`, `"comments"`, and `"hints"` in
    `params` switch off the removal of docstrings, comments, and type
    hints, respectively.

    :param code: the code loaded from a file
    :param lines: the lines to keep, or `None` if we keep all
    :param labels: a list of labels marking start and end of code snippets
        to include
    :param params: the arguments for the code formatter
    :return: the formatted code string, ending in one newline character
    """
    selected = select_lines(code=code, labels=labels, lines=lines,
                            max_consecutive_empty_lines=2)

    # Without parameters, everything is stripped and formatted.
    options = () if params is None else params

    if "format" not in options:
        selected = format_python(
            selected,
            strip_docstrings="doc" not in options,
            strip_comments="comments" not in options,
            strip_hints="hints" not in options)

    # Drop all trailing empty lines ...
    while (len(selected) > 0) and (not selected[-1]):
        del selected[-1]

    # ... and add exactly one back. After the loop above, the list is
    # either empty or ends with a non-empty line, so this always applies.
    selected.append("")
    return "\n".join(line.rstrip() for line in selected)
# Execute the formatter as script
if __name__ == "__main__":
    parser: Final[argparse.ArgumentParser] = make_argparser(
        __file__, "Execute the Python Formatter.",
        make_epilog(
            "Format Python code received via stdin, write it to stdout.",
            2023, None, "Thomas Weise",
            url="https://thomasweise.github.io/texgit_py",
            email="tweise@hfuu.edu.cn, tweise@ustc.edu.cn"),
        __version__)

    # All three options take an optional string that defaults to "".
    for option, description in (
            ("--lines", "a comma-separated list of selected lines"),
            ("--labels", "a comma-separated list of labels"),
            ("--args", "a comma-separated list of arguments: "
                       "'format' to keep the whole format, "
                       "'doc' means keep the documentation, "
                       "'hints' means keep type hints, "
                       "'comments' means keep comments ")):
        parser.add_argument(
            option, help=description, type=str, default="", nargs="?")

    args: Final[argparse.Namespace] = parser.parse_args()

    # Read the complete code from stdin, write the result to stdout.
    input_lines: Final[list[str]] = sys.stdin.readlines()
    sys.stdout.write(preprocess_python(
        input_lines,
        split_line_choices(args.lines),
        split_labels(args.labels),
        split_labels(args.args)))
    sys.stdout.flush()