Coverage for bookbuilderpy/format_python.py: 93%
143 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-17 23:15 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-17 23:15 +0000
1"""A formatter for python code."""
2import io
3import sys
4import token
5import tokenize
6from typing import Iterable
8import regex as reg # type: ignore
9import strip_hints as sh # type: ignore
10import yapf # type: ignore
12from bookbuilderpy.source_tools import (
13 format_empty_lines,
14 select_lines,
15 strip_common_whitespace_prefix,
16)
17from bookbuilderpy.strings import lines_to_str, str_to_lines
18from bookbuilderpy.types import type_error
def __no_empty_after(line: str) -> bool:
    """
    Check whether no empty line may follow the given line.

    This is the case if the line begins with one of the keywords `def`,
    `import`, or `from`, each directly followed by a single space.

    :param line: the line
    :return: `True` if the line starts with one of the three prefixes,
        `False` otherwise

    >>> __no_empty_after("def ")
    True
    >>> __no_empty_after("import ")
    True
    >>> __no_empty_after("from ")
    True
    >>> __no_empty_after("def")
    False
    >>> __no_empty_after("import")
    False
    >>> __no_empty_after("from")
    False
    """
    # The trailing space is part of each prefix, so that identifiers which
    # merely begin with a keyword (e.g., "define") do not match.
    prefixes: tuple[str, ...] = ("def ", "import ", "from ")
    return any(line.startswith(prefix) for prefix in prefixes)
def __empty_before(line: str) -> bool:
    """
    Check whether an empty line is required in front of the given line.

    This is the case if the line begins with `def` or `class`, each
    directly followed by a single space.

    :param line: the line
    :return: `True` if the line starts with `def ` or `class `, `False`
        otherwise

    >>> __empty_before("def")
    False
    >>> __empty_before("def ")
    True
    >>> __empty_before("class")
    False
    >>> __empty_before("class ")
    True
    """
    if line.startswith("def "):
        return True
    return line.startswith("class ")
def __force_no_empty_after(line: str) -> bool:
    """
    Check whether an empty line after the given line is strictly forbidden.

    This is the case if the line begins with the character `@`.

    :param line: the line
    :return: `True` if the first character of the line is `@`, `False`
        otherwise

    >>> __force_no_empty_after("@")
    True
    """
    # Slicing instead of indexing keeps the empty string safe.
    return line[:1] == "@"
#: the internal style for formatting Python code: the PEP8 style of yapf
#: with the settings below overridden
__YAPF_STYLE = yapf.style.CreatePEP8Style()
__YAPF_STYLE.update({
    "ARITHMETIC_PRECEDENCE_INDICATION": True,
    "BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION": 1,
    "BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF": False,
    "COALESCE_BRACKETS": True,
    "COLUMN_LIMIT": 74,
    "EACH_DICT_ENTRY_ON_SEPARATE_LINE": False,
    "SPLIT_BEFORE_NAMED_ASSIGNS": False,
})
def __format_lines(code: str) -> str:
    r"""
    Format a piece of Python code with yapf.

    The code is passed through `yapf.yapf_api.FormatCode` using the
    module-internal style and trailing white space is removed from the
    result.

    :param code: the original code
    :return: the formatted code, without trailing white space

    >>> __format_lines("\ndef a():\n    return 7- 45\n\n")
    'def a():\n    return 7 - 45'
    >>> __format_lines("\n\n \nclass b:\n    def bb(self): x =3/a()")
    'class b:\n    def bb(self):\n        x = 3 / a()'
    """
    outcome = yapf.yapf_api.FormatCode(code, style_config=__YAPF_STYLE)
    # The first element of the result is the formatted source text.
    formatted: str = outcome[0]
    return formatted.rstrip()
#: the regex matching a comment that occupies a complete line: a newline,
#: optional spaces or tabs, a `#` with the rest of the line, and the
#: newline that ends the line
__REGEX_STRIP_LINE_COMMENT: reg.Regex = reg.compile(
    r"\n[ \t]*?#.*?\n",
    flags=reg.V1 | reg.MULTILINE)  # pylint: disable=E1101
def __strip_hints(code: str,
                  strip_comments: bool = False) -> str:
    r"""
    Remove all type hints from the given code string.

    The hints are removed with `strip_hints.strip_string_to_string`.
    Lines that consisted only of a type hint come back from that step as
    line comments; these are deleted afterwards.

    :param code: the code string
    :param strip_comments: if `True`, every comment that occupies a
        complete line is deleted; if `False`, only those comment lines
        are deleted whose counterpart in `code` was not a comment
    :return: the stripped code string

    >>> __format_lines(__strip_hints(
    ...     "a: int = 7\ndef b(c: int) -> List[int]:\n    return [4]"))
    'a = 7\n\ndef b(c):\n    return [4]'
    """
    stripped: str = sh.strip_string_to_string(
        code, strip_nl=True, to_empty=True)

    if strip_comments:
        # All line comments go anyway, so a regular expression is enough.
        # One pass can miss directly adjacent comment lines because a
        # match consumes the newline the next match would need, so we
        # repeat until nothing changes anymore.
        while True:
            reduced = reg.sub(__REGEX_STRIP_LINE_COMMENT, "\n", stripped)
            if reduced == stripped:
                return reduced
            stripped = reduced

    # Normal comments must survive: compare old and new text line by line
    # and drop only those lines that became a comment during stripping.
    before: list[str] = code.splitlines()
    after: list[str] = stripped.splitlines()
    shared: int = min(len(before), len(after))

    kept: list[str] = []
    for index, new_line in enumerate(after):
        if index < shared:
            old_text: str = before[index].strip()
            new_text: str = new_line.strip()
            if new_text.startswith("#") and new_text.endswith(old_text) \
                    and not old_text.startswith("#"):
                continue  # a former type-hint-only line: discard it
        kept.append(new_line)
    return lines_to_str(kept, trailing_newline=False)
def __strip_docstrings_and_comments(code: str,
                                    strip_docstrings: bool = True,
                                    strip_comments: bool = True) -> str:
    r"""
    Remove all docstrings and comments from a string.

    Comments that occupy a complete line are first removed with a regular
    expression. Afterwards the code is tokenized and written back token
    by token, leaving out docstrings and/or the remaining comments.
    A string token counts as docstring if it directly follows an indent
    or the end of a logical line (or is the very first token).
    Finally, all leading newlines are removed from the result.

    :param code: the code
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :return: the stripped string
    :raises ValueError: if nothing but newlines remains after stripping

    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, False)
    'a = 5# bla\n'
    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, True)
    'a = 5\n'
    >>> __strip_docstrings_and_comments('def b():\n \"\"\"bla!\"\"\"', True)
    'def b():\n '
    >>> __strip_docstrings_and_comments('# 1\na = 5\n# 2\nb = 6\n')
    'a = 5\nb = 6\n'
    """
    # First, we strip line comments that are hard to catch correctly with
    # the tokenization approach later. A single pass may miss directly
    # adjacent comment lines, so we repeat until nothing changes.
    if strip_comments:
        while True:
            reduced = reg.sub(__REGEX_STRIP_LINE_COMMENT, "\n", code)
            if reduced == code:
                break
            code = reduced

    # Now we strip the doc strings and remaining comments.
    prev_toktype: int = token.INDENT  # lets a leading string be a docstring
    last_lineno: int = -1
    last_col: int = 0
    eat_newline: int = 0  # set to 1 when a docstring was just removed
    with io.StringIO() as output:
        with io.StringIO(code) as reader:
            for toktype, tttext, (slineno, scol), (telineno, ecol), _ in \
                    tokenize.generate_tokens(reader.readline):
                elineno = telineno
                ttext = tttext
                eat_newline -= 1
                # A new source line starts at column 0 again.
                if slineno > last_lineno:
                    last_col = 0
                # Re-create the white space between two tokens.
                if scol > last_col:
                    output.write(" " * (scol - last_col))
                if (toktype == token.STRING) and \
                        (prev_toktype in (token.INDENT, token.NEWLINE)):
                    if strip_docstrings:
                        ttext = ""
                        eat_newline = 1
                elif toktype == tokenize.COMMENT:
                    if strip_comments:
                        ttext = ""
                elif toktype == tokenize.NEWLINE and eat_newline >= 0:
                    # This newline ends a removed docstring: drop it and
                    # pretend we are already on the next line, so that the
                    # indentation written before the docstring is reused
                    # by the following statement.
                    ttext = ""
                    elineno += 1
                output.write(ttext)
                prev_toktype = toktype
                last_col = ecol
                last_lineno = elineno

        # remove leading newlines
        result = output.getvalue().lstrip("\n")

    if result:
        return result
    raise ValueError(f"code {code} becomes empty after docstring "
                     "and comment stripping!")
def format_python(code: Iterable[str],
                  strip_docstrings: bool = True,
                  strip_comments: bool = True,
                  strip_hints: bool = True) -> list[str]:
    """
    Format a python code fragment.

    The fragment is repeatedly normalized (empty lines, common white
    space prefix), formatted, and stripped of the selected elements. The
    loop ends as soon as a round no longer makes the text smaller, where
    size is the pair (number of newline characters, number of
    characters). The smallest version found is returned.

    :param code: the code fragment
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :param strip_hints: should we delete type hints?
    :return: the formatted code
    :raises ValueError: if the code becomes empty, or if the final text
        contains no newline character
    """
    if not isinstance(code, Iterable):
        raise type_error(code, "code", Iterable)
    # The flags are checked in the order of the parameter list.
    for flag_name, flag in (("strip_docstrings", strip_docstrings),
                            ("strip_comments", strip_comments),
                            ("strip_hints", strip_hints)):
        if not isinstance(flag, bool):
            raise type_error(flag, flag_name, bool)

    best_size: tuple[int, int] = (sys.maxsize, sys.maxsize)
    best: list[str] = list(code)
    current: list[str] = best
    first_round: bool = True
    while True:
        # Step 1: normalize empty lines and the common indentation.
        current = strip_common_whitespace_prefix(format_empty_lines(
            lines=current,
            empty_before=__empty_before,
            no_empty_after=__no_empty_after,
            force_no_empty_after=__force_no_empty_after,
            max_consecutive_empty_lines=1))
        if len(current) <= 0:
            raise ValueError("Code becomes empty.")

        text = lines_to_str(current)
        size: tuple[int, int] = (text.count("\n"), len(text))
        if (not first_round) and (best_size <= size):
            break  # no improvement over the best version so far
        best = current
        best_size = size

        # Step 2: format the code and strip what should be stripped;
        # format again only if the stripping changed something.
        text = __format_lines(text)
        stripped = text
        if strip_docstrings or strip_comments:
            stripped = __strip_docstrings_and_comments(
                text, strip_docstrings=strip_docstrings,
                strip_comments=strip_comments).rstrip()
        if strip_hints:
            stripped = __strip_hints(stripped,
                                     strip_comments=strip_comments)
        if stripped != text:
            text = __format_lines(stripped)
        del stripped

        text = text.rstrip()
        size = (text.count("\n"), len(text))
        if (not first_round) and (best_size <= size):
            break  # no improvement over the best version so far

        current = str_to_lines(text)
        best = current
        best_size = size
        first_round = False

    if (len(best) <= 0) or (best_size[0] <= 0):
        raise ValueError(f"Text cannot become {best}.")

    return best
def preprocess_python(code: list[str],
                      lines: list[int] | None = None,
                      labels: Iterable[str] | None = None,
                      args: set[str] | None = None) -> str:
    r"""
    Preprocess Python code.

    First, we select all lines of the code we want to keep.
    If labels are defined, then lines can be kept as ranges or as single
    lines.
    Otherwise, all lines are selected in this step.

    Then, if line numbers are provided, we select the lines based on the
    line numbers from the lines we have preserved.

    Finally, the Python formatter is applied, unless it is switched off.

    Each step of the formatter is active by default and is switched off
    by the presence of a string in `args`: `format` skips the formatter
    entirely, `doc` keeps the docstrings, `comments` keeps the comments,
    and `hints` keeps the type hints.

    :param code: the code loaded from a file
    :param lines: the lines to keep, or `None` if we keep all
    :param labels: a list of labels marking start and end of code snippets
        to include
    :param args: the arguments for the code formatter
    :return: the formatted code string
    """
    selected = select_lines(code=code, labels=labels, lines=lines)

    # An absent or empty argument set leaves every step switched on.
    switches = args if args else ()

    if "format" in switches:
        return lines_to_str(selected, trailing_newline=True)

    formatted = format_python(selected,
                              strip_docstrings="doc" not in switches,
                              strip_comments="comments" not in switches,
                              strip_hints="hints" not in switches)
    return lines_to_str(formatted, trailing_newline=True)