Coverage for texgit/formatters/python.py: 63%

1"""A formatter for python code."""

2import argparse

3import io

4import sys

5import token

6import tokenize

7from re import MULTILINE, Pattern, sub

8from re import compile as re_compile

9from typing import Final, Iterable

11import strip_hints as sh # type: ignore

12import yapf # type: ignore

13from pycommons.io.arguments import make_argparser, make_epilog

14from pycommons.types import type_error

16from texgit.formatters.source_tools import (

17 format_empty_lines,

18 select_lines,

19 split_labels,

20 split_line_choices,

21 strip_common_whitespace_prefix,

22)

23from texgit.version import __version__

def __no_empty_after(line: str) -> bool:
    """
    Check whether no empty line is permitted after the given line.

    This holds for lines beginning with one of the keywords `def`,
    `import`, or `from`, each followed by a single space.

    :param line: the line
    :return: `True` if the line starts with `"def "`, `"import "`, or
        `"from "`, `False` otherwise

    >>> __no_empty_after("def ")
    True
    >>> __no_empty_after("import ")
    True
    >>> __no_empty_after("from ")
    True
    >>> __no_empty_after("def")
    False
    >>> __no_empty_after("import")
    False
    >>> __no_empty_after("from")
    False
    """
    # The trailing space is part of each prefix, so that identifiers which
    # merely begin with a keyword (e.g., "define") are not matched.
    return line.startswith(("def ", "import ", "from "))

def __empty_before(line: str) -> bool:
    """
    Check whether an empty line is needed before this one.

    This holds for lines beginning with `def` or `class`, each followed
    by a single space.

    :param line: the line
    :return: `True` if the line starts with `"def "` or `"class "`,
        `False` otherwise

    >>> __empty_before("def")
    False
    >>> __empty_before("def ")
    True
    >>> __empty_before("class")
    False
    >>> __empty_before("class ")
    True
    """
    # The trailing space is part of each prefix, so a bare keyword or an
    # identifier starting with it does not count as a definition.
    return line.startswith(("def ", "class "))

def __force_no_empty_after(line: str) -> bool:
    """
    Check whether really no empty line is permitted after the given line.

    This holds for every line beginning with the character `@`.

    :param line: the line
    :return: `True` if the line starts with `"@"`, `False` otherwise

    >>> __force_no_empty_after("@")
    True
    >>> __force_no_empty_after("x = 1")
    False
    """
    return line.startswith("@")

#: the internal style for formatting Python code: yapf's PEP8 style with
#: a few settings overridden below
__YAPF_STYLE = yapf.style.CreatePEP8Style()
__YAPF_STYLE["ARITHMETIC_PRECEDENCE_INDICATION"] = True
__YAPF_STYLE["BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION"] = 2
__YAPF_STYLE["BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF"] = False
__YAPF_STYLE["COALESCE_BRACKETS"] = True
# the formatted code is limited to 74 columns
__YAPF_STYLE["COLUMN_LIMIT"] = 74
__YAPF_STYLE["EACH_DICT_ENTRY_ON_SEPARATE_LINE"] = False
__YAPF_STYLE["SPLIT_BEFORE_NAMED_ASSIGNS"] = False

def __format_lines(code: str) -> str:
    r"""
    Format Python code lines.

    The code is reformatted by yapf using the module-internal style.
    Trailing whitespace is removed from the result and each occurrence of
    four consecutive newline characters is replaced by three.

    :param code: the original code
    :return: the formatted lines.

    >>> __format_lines("\ndef a():\n return 7- 45\n\n")
    'def a():\n    return 7 - 45'
    >>> __format_lines("\n\n \nclass b:\n def bb(self): x =3/a()")
    'class b:\n    def bb(self):\n        x = 3 / a()'
    """
    # `FormatCode` returns a tuple whose first element is the reformatted
    # source code; only that element is needed here.
    formatted: str = yapf.yapf_api.FormatCode(
        code, style_config=__YAPF_STYLE)[0]
    return formatted.rstrip().replace("\n\n\n\n", "\n\n\n")

103

104

#: the regex stripping comments that occupy a complete line:
#: It matches a newline, optional spaces or tabs, a `#`, and the rest of
#: that line including its terminating newline. Since a match consumes
#: the terminating newline, the second of two directly consecutive
#: comment lines is not matched in the same pass; users of this pattern
#: therefore apply it repeatedly until the text no longer changes.
#: A comment on the very first line has no preceding newline and is
#: never matched.
__REGEX_STRIP_LINE_COMMENT: Pattern = re_compile(
    r"\n[ \t]*?#.*?\n", flags=MULTILINE)

108

109

def __strip_hints(
        code: str, strip_comments: bool = False) -> str:
    r"""
    Strip all type hints from the given code string.

    :param code: the code string
    :param strip_comments: if `True`, all comments occupying a complete
        line are removed from the result as well; if `False`, only those
        comment lines that were newly created by the hint stripping are
        removed
    :return: the stripped code string

    >>> __format_lines(__strip_hints(
    ...     "a: int = 7\ndef b(c: int) -> List[int]:\n return [4]"))
    'a = 7\n\n\ndef b(c):\n    return [4]'
    """
    new_text: str = sh.strip_string_to_string(
        code, strip_nl=True, to_empty=True)

    # If we have single lines with type hints only, the above will turn
    # them into line comments. We need to get rid of those.

    if strip_comments:
        # In the ideal case, we want to strip all comments anyway.
        # Then we do not need to bother with anything complex and can
        # directly use a regular expression getting rid of them.
        # One substitution pass can skip the second of two consecutive
        # comment lines, so we repeat until nothing changes anymore.
        previous: str | None = None
        while previous != new_text:
            previous = new_text
            new_text = sub(__REGEX_STRIP_LINE_COMMENT, "\n", new_text)
        return new_text

    # If we should preserve normal comments, all we can do is trying to
    # find these "new" comments in a very pedestrian fashion: A line is
    # dropped if it is a comment now, was not a comment originally, and
    # still ends with the original line's text.
    orig_lines: list[str] = code.splitlines()
    new_lines: list[str] = new_text.splitlines()
    # Walk backwards so that deleting an entry does not shift the indices
    # of the lines that still need to be compared.
    for i in range(min(len(orig_lines), len(new_lines)) - 1, -1, -1):
        original: str = orig_lines[i].strip()
        stripped: str = new_lines[i].strip()
        if stripped.startswith("#") and (not original.startswith("#")) \
                and stripped.endswith(original):
            del new_lines[i]
    return "\n".join(map(str.rstrip, new_lines))

148

149

def __strip_docstrings_and_comments(code: str,
                                    strip_docstrings: bool = True,
                                    strip_comments: bool = True) -> str:
    r"""
    Remove all docstrings and comments from a string.

    :param code: the code
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :return: the stripped string, without leading newline characters
    :raises ValueError: if nothing but newline characters remains after
        the stripping

    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, False)
    'a = 5# bla\n'
    >>> __strip_docstrings_and_comments("a = 5# bla\n", False, True)
    'a = 5\n'
    >>> __strip_docstrings_and_comments('def b():\n \"\"\"bla!\"\"\"', True)
    'def b():\n '
    >>> __strip_docstrings_and_comments('# 1\na = 5\n# 2\nb = 6\n')
    'a = 5\nb = 6\n'
    """
    # First, we strip line comments that are hard to catch correctly with
    # the tokenization approach later. One substitution pass can skip the
    # second of two consecutive comment lines, so we repeat until the
    # code does not change anymore.
    if strip_comments:
        previous: str | None = None
        while previous != code:
            previous = code
            code = sub(__REGEX_STRIP_LINE_COMMENT, "\n", code)
        del previous

    # Now we strip the doc strings and remaining comments by re-emitting
    # the token stream and blanking out the unwanted tokens.
    prev_toktype: int = token.INDENT
    last_lineno: int = -1
    last_col: int = 0
    eat_newline: int = 0
    with io.StringIO() as output:
        with io.StringIO(code) as reader:
            for toktype, tok_text, (slineno, scol), (tok_end, ecol), _ in \
                    tokenize.generate_tokens(reader.readline):
                elineno: int = tok_end
                ttext: str = tok_text
                eat_newline -= 1
                if slineno > last_lineno:
                    last_col = 0  # a new line begins at column 0
                if scol > last_col:
                    # re-create the whitespace between two tokens
                    output.write(" " * (scol - last_col))
                if (toktype == token.STRING) and \
                        (prev_toktype in {token.INDENT, token.NEWLINE}):
                    # a string starting a logical line is a docstring
                    if strip_docstrings:
                        ttext = ""
                        # also swallow the newline directly following it
                        eat_newline = 1
                elif toktype == tokenize.COMMENT:
                    if strip_comments:
                        ttext = ""
                elif toktype == tokenize.NEWLINE and eat_newline >= 0:
                    ttext = ""
                    elineno += 1
                output.write(ttext)
                prev_toktype = toktype
                last_col = ecol
                last_lineno = elineno

        # must be read before the output buffer gets closed
        result: str = output.getvalue().lstrip("\n")  # no leading newlines

    if result:
        return result
    raise ValueError(f"code {code} becomes empty after docstring "
                     "and comment stripping!")

222

223

def format_python(code: Iterable[str],
                  strip_docstrings: bool = True,
                  strip_comments: bool = True,
                  strip_hints: bool = True) -> list[str]:
    """
    Format a python code fragment.

    The code is repeatedly passed through empty-line normalization,
    removal of the common whitespace prefix, the code formatter, and the
    requested stripping steps. This is iterated until the text, measured
    by the tuple `(number of newlines, number of characters)`, does not
    become smaller anymore. The shortest version found is returned.

    :param code: the code fragment
    :param strip_docstrings: should we delete docstrings?
    :param strip_comments: should we delete comments?
    :param strip_hints: should we delete type hints?
    :return: the formatted code
    :raises ValueError: if the code becomes empty or if the final text
        contains no newline character
    """
    # `type_error` comes from `pycommons.types`; the exceptions it
    # constructs are raised if an argument has the wrong type.
    if not isinstance(code, Iterable):
        raise type_error(code, "code", Iterable)
    if not isinstance(strip_docstrings, bool):
        raise type_error(strip_docstrings, "strip_docstrings", bool)
    if not isinstance(strip_comments, bool):
        raise type_error(strip_comments, "strip_comments", bool)
    if not isinstance(strip_hints, bool):
        raise type_error(strip_hints, "strip_hints", bool)

    # the size of the best text so far: (newline count, character count)
    old_len: tuple[int, int] = (sys.maxsize, sys.maxsize)

    shortest: list[str] = list(code)
    rcode: list[str] = shortest
    not_first_run: bool = False
    while True:
        # step 1: normalize empty lines and remove the common indentation
        rcode = strip_common_whitespace_prefix(format_empty_lines(
            lines=rcode,
            empty_before=__empty_before,
            no_empty_after=__no_empty_after,
            force_no_empty_after=__force_no_empty_after,
            max_consecutive_empty_lines=2))
        if len(rcode) <= 0:
            raise ValueError("Code becomes empty.")

        text = "\n".join(map(str.rstrip, rcode))
        new_len: tuple[int, int] = (text.count("\n"), len(text))
        # stop as soon as a complete iteration brought no improvement
        if not_first_run and (old_len <= new_len):
            break
        shortest = rcode
        old_len = new_len

        # step 2: format the code, strip what should be stripped, and
        # re-format if the stripping changed anything
        text = __format_lines(text)
        ntext = text
        if strip_docstrings or strip_comments:
            ntext = __strip_docstrings_and_comments(
                text, strip_docstrings=strip_docstrings,
                strip_comments=strip_comments).rstrip()
        if strip_hints:
            ntext = __strip_hints(ntext,
                                  strip_comments=strip_comments)
        if ntext != text:
            text = __format_lines(ntext)
        del ntext

        text = text.rstrip()
        new_len = text.count("\n"), len(text)
        if not_first_run and (old_len <= new_len):
            break

        rcode = list(map(str.rstrip, text.splitlines()))
        shortest = rcode
        old_len = new_len
        not_first_run = True

    # NOTE(review): `old_len[0]` is the number of newline characters, so
    # a result consisting of only a single line is rejected here, too.
    # Confirm that this is intended.
    if (len(shortest) <= 0) or (old_len[0] <= 0):
        raise ValueError(f"Text cannot become {shortest}.")

    return shortest

295

296

def preprocess_python(code: list[str],
                      lines: list[int] | None = None,
                      labels: Iterable[str] | None = None,
                      params: set[str] | None = None) -> str:
    r"""
    Preprocess Python code.

    First, we select all lines of the code we want to keep.
    If labels are defined, then lines can be kept as ranges or as single
    lines.
    Otherwise, all lines are selected in this step.

    Then, if line numbers are provided, we select the lines based on the
    line numbers from the lines we have preserved.

    Finally, the Python formatter is applied.

    :param code: the code loaded from a file
    :param lines: the lines to keep, or `None` if we keep all
    :param labels: a list of labels marking start and end of code snippets
        to include
    :param params: the arguments for the code formatter: `"format"`
        disables the formatting entirely, `"doc"` keeps the docstrings,
        `"comments"` keeps the comments, and `"hints"` keeps the type
        hints; `None` means that everything is formatted and stripped
    :return: the formatted code string, which always ends with exactly one
        newline character
    """
    keep_lines = select_lines(code=code, labels=labels, lines=lines,
                              max_consecutive_empty_lines=2)

    # set up arguments: each parameter that is present switches the
    # corresponding processing step off
    strip_docstrings: bool = True
    strip_comments: bool = True
    strip_hintx: bool = True
    do_format: bool = True
    if params is not None:
        do_format = "format" not in params
        strip_docstrings = "doc" not in params
        strip_comments = "comments" not in params
        strip_hintx = "hints" not in params

    if do_format:
        keep_lines = format_python(keep_lines,
                                   strip_docstrings=strip_docstrings,
                                   strip_comments=strip_comments,
                                   strip_hints=strip_hintx)

    # remove all trailing empty lines
    while (len(keep_lines) > 0) and (not keep_lines[-1]):
        del keep_lines[-1]

    # Now the list is either empty or ends with a non-empty line, so we
    # always append one empty line to terminate the text with a newline.
    keep_lines.append("")
    return "\n".join(map(str.rstrip, keep_lines))

347

348

# Execute the formatter as script: The Python code is read from stdin,
# preprocessed according to the command line arguments, and written to
# stdout.
if __name__ == "__main__":
    parser: Final[argparse.ArgumentParser] = make_argparser(
        __file__, "Execute the Python Formatter.",
        make_epilog(
            "Format Python code received via stdin, write it to stdout.",
            2023, None, "Thomas Weise",
            url="https://thomasweise.github.io/texgit_py",
            email="tweise@hfuu.edu.cn, tweise@ustc.edu.cn"),
        __version__)
    parser.add_argument(
        "--lines", help="a comma-separated list of selected lines",
        type=str, default="", nargs="?")
    parser.add_argument(
        "--labels", help="a comma-separated list of labels",
        type=str, default="", nargs="?")
    parser.add_argument(
        "--args", help="a comma-separated list of arguments: "
                       "'format' to keep the whole format, "
                       "'doc' means keep the documentation, "
                       "'hints' means keep type hints, "
                       "'comments' means keep comments ",
        type=str, default="", nargs="?")
    args: Final[argparse.Namespace] = parser.parse_args()
    input_lines: Final[list[str]] = sys.stdin.readlines()
    # the formatter arguments are split with the same helper as the labels
    sys.stdout.write(preprocess_python(
        input_lines,
        split_line_choices(args.lines),
        split_labels(args.labels),
        split_labels(args.args)))
    sys.stdout.flush()

Coverage for texgit / formatters / python.py: 63%

159 statements