Coverage for pycommons / io / parser.py: 100%

65 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-06-03 00:58 +0000

1""" 

2A tool for recursively parsing data from directories. 

3 

4This module provides a unified API for parsing data from files in 

5directories. The goal is to offer a way to return a generator that 

6allows us to iterate over the data loaded. While we iterate over this 

7data, the generator internally iterates over the files. 

8 

9This means that the control of how the data is loaded stays with the user, 

10while the programmer can implement the necessary methods to load and process 

11data in a natural way. 

12""" 

13 

14from typing import Final, Generator, Iterable, TypeVar 

15 

16from pycommons.io.console import logger 

17from pycommons.io.path import Path, directory_path, file_path 

18from pycommons.types import type_error 

19 

20#: the type variable for data to be read from the directories 

21T = TypeVar("T") 

22 

23 

24class Parser[T]: 

25 """ 

26 The parser class. 

27 

28 This class allows you to implement convenient parsing routines that can 

29 hierarchically process nested directories of files and return a stream, 

30 i.e., a :class:`Generator` of results. In other words, it flattens the 

31 hierarchical processing of directories into a linear sequence of data. 

32 This allows the user of the API stay in control of when the data is loaded 

33 while the programmer of the parser API can work in a convenient way with 

34 high-level abstractions. Another advantage of this parsing API is that its 

35 results can be processed like a stream and be piped into some filters, 

36 processors, or even output destinations while it is loaded from the files. 

37 For example, we can extract certain elements of data from huge collections 

38 of files and while they are loaded, they could already be processed and 

39 stored to a stream of CSV data. 

40 

41 The method :meth:`~pycommons.io.parser.Parser.parse` can be applied to any 

42 path to a file or directory and will hierarchically process the path and 

43 yield the parsing results one by one. This is the normal entry point 

44 function for this parsing API. 

45 The method :meth:`~pycommons.io.parser.Parser.parse_file` is a convenient 

46 wrapper that processes a single file in *exactly the same way*. 

47 The method :meth:`~pycommons.io.parser.Parser.parse_directory` parses a 

48 path that identifies a directory. 

49 

50 This class offers an internal API, where the internal functions are 

51 prefixed with `_`, that allows you to customize the hierarchical parsing 

52 process to a high degree. You can decide which directories and files to 

53 process, and you can set up and tear down datastructures on a per-file or 

54 per-directory basis. All the internal functions are invoked in a 

55 consistent way, regardless whether you parse single files or nested 

56 directories. 

57 """ 

58 

59 def _start_parse(self, root: Path) -> None: 

60 """ 

61 Begin the parsing process. 

62 

63 This method is called before the recursing parsing begins. It can be 

64 used to initialize any internal datastructures to make the parser 

65 reusable. 

66 

67 :param root: the root path of the parsing process 

68 """ 

69 

70 # pylint: disable=W0613 

71 def _should_parse_file(self, file: Path) -> bool: # noqa: ARG002 

72 """ 

73 Check whether we should start parsing a file. 

74 

75 The other file-parsing routines are only called if this method returns 

76 `True` for a file. Any overriding method should first call the super 

77 method. 

78 

79 :param file: the current file path 

80 :returns: `True` if the file should be parsed, `False` otherwise 

81 """ 

82 return True 

83 

84 def _start_parse_file(self, file: Path) -> None: 

85 """ 

86 Check whether we should start parsing a file. 

87 

88 Any method overriding this method should first invoke the super method 

89 and then perform its own startup code. 

90 

91 :param file: the current file path 

92 """ 

93 

94 # pylint: disable=W0613 

95 def _parse_file(self, file: Path) -> T | None: # noqa: ARG002 

96 """ 

97 Parse a file and return the result. 

98 

99 :param file: the current file path 

100 :returns: the parsing result 

101 """ 

102 return None 

103 

104 def _end_parse_file(self, file: Path) -> None: 

105 """ 

106 Cleanup after a file has been parsed. 

107 

108 Any method overriding this function should first perform its own 

109 cleanup and then call the super implementation. 

110 

111 :param file: the current file path 

112 """ 

113 

114 # pylint: disable=W0613 

115 def _should_list_directory(self, directory: Path) \ 

116 -> tuple[bool, bool]: # noqa: ARG002 

117 """ 

118 Check whether we should parse a directory. 

119 

120 This method is called whenever the parser enters a directory. 

121 It should return a :class:`tuple` of two :class:`bool` values. 

122 The first one indicates whether the sub-directories of this directory 

123 should be processed. `True` means that they are listed and processed. 

124 `False` means that they are skipped. The second Boolean value 

125 indicates whether the files inside the directory should be listed. 

126 `True` means that the files should be listed, `False` means that they 

127 are not. 

128 

129 Any overriding method should first call the super method. 

130 

131 :param directory: the current directory path 

132 :returns: A :class:`tuple` of two `bool` values, where the first one 

133 indicates whether sub-directories should be visited and the 

134 second one indicates whether files should be listed 

135 """ 

136 return True, True 

137 

138 def _start_list_directory(self, directory: Path) -> None: 

139 """ 

140 Prepare for listing a directory. 

141 

142 This method is only called if `_should_list_directory` returned 

143 `True`. 

144 

145 :param directory: the current directory path 

146 """ 

147 

148 def _end_list_directory(self, directory: Path) -> None: 

149 """ 

150 Clean up after a directory has been processed. 

151 

152 :param directory: the current directory path 

153 """ 

154 

155 def _end_parse(self, root: Path) -> None: 

156 """ 

157 End the parsing process. 

158 

159 This method can perform any cleanup and purging of internal 

160 datastructures to make the parser reusable. 

161 

162 :param root: the root path of the parsing process 

163 """ 

164 

165 def _progress_logger(self, text: str) -> None: 

166 """ 

167 Log the progress. 

168 

169 This method is called with a string that should be logged. By default, 

170 it forwards the string to :func:`logger`. 

171 

172 :param text: the test 

173 """ 

174 logger(text) 

175 

176 def __internal_parse(self, paths: Iterable[Path], log_progress: bool, 

177 is_root: bool) -> Generator[T, None, None]: 

178 """ 

179 Perform the internal parsing work. 

180 

181 This method should never be called directly. It is called by `parse`. 

182 

183 :param paths: the paths to parse. 

184 :param log_progress: should we log progress? 

185 :param is_root: is this the root of parsing 

186 :returns: the generator 

187 """ 

188 current: Path | None = None 

189 for current in paths: 

190 if current.is_file(): 

191 # The current path identifies a file. We need to check whether 

192 # this file should be parsed and, if so, parse it and yield 

193 # from the parsing results. 

194 should: bool = self._should_parse_file(current) 

195 if not isinstance(should, bool): # type check 

196 raise type_error(should, "should", bool) 

197 if should: # OK, the file should be parsed. 

198 self._start_parse_file(current) 

199 result: T | None = self._parse_file(current) 

200 if result is not None: # We got some result. 

201 yield result 

202 # Notify the end of parsing. 

203 self._end_parse_file(current) 

204 elif current.is_dir(): # The path is a directory. 

205 # Check if we should parse. 

206 list_dirs, list_files = self._should_list_directory(current) 

207 if not isinstance(list_dirs, bool): 

208 raise type_error( # wrong type 

209 list_dirs, "retval[1] of start_list_dir", bool) 

210 if not isinstance(list_files, bool): 

211 raise type_error( # wrong type 

212 list_files, "retval[2] of start_list_dir", bool) 

213 if list_dirs or list_files: 

214 self._start_list_directory(current) 

215 # add the current directory name 

216 if log_progress: 

217 self._progress_logger( 

218 f"entering directory {current!r}.") 

219 yield from self.__internal_parse(current.list_dir( 

220 list_files, list_dirs), log_progress, False) 

221 self._end_list_directory(current) 

222 if is_root: 

223 self._end_parse(current) 

224 if log_progress: 

225 self._progress_logger(f"finished parsing {current!r}.") 

226 

227 def parse(self, path: str, log_progress: bool = True) \ 

228 -> Generator[T, None, None]: 

229 """ 

230 Parse the given path. 

231 

232 :param path: the path to parse 

233 :param log_progress: should the progress be logged? 

234 :returns: the parsed sequence 

235 """ 

236 root: Final[Path] = Path(path) 

237 if not isinstance(log_progress, bool): 

238 raise type_error(log_progress, "log_progress", bool) 

239 

240 if log_progress: 

241 self._progress_logger(f"beginning to parse {root!r}.") 

242 self._start_parse(root) 

243 return self.__internal_parse((root, ), log_progress, True) 

244 

245 def parse_file(self, file: str, log_progress: bool = False) -> T: 

246 """ 

247 Parse a single file. 

248 

249 This method guarantees to not return `None`. If the internal parsing 

250 process yields `None` anyway, it will raise a :class:`TypeError`. 

251 It will also raise a :class:`ValueError` if `file` does not identify a 

252 file. 

253 

254 :param file: the file to parse 

255 :param log_progress: should the progress be logged? 

256 :returns: the parsing result. 

257 """ 

258 path: Final[Path] = file_path(file) 

259 try: 

260 return next(self.parse(path, log_progress)) 

261 except StopIteration as se: 

262 raise TypeError( 

263 f"result of parsing file {path!r} should not be None.")\ 

264 from se 

265 

266 def parse_directory(self, directory: str, log_progress: bool = True) \ 

267 -> Generator[T, None, None]: 

268 """ 

269 Parse a directory of files. 

270 

271 This function basically works exactly as 

272 :meth:`~pycommons.io.parser.Parser.parse`, but it enforces that 

273 `directory` is a directory and raises a :class:`ValueError` otherwise. 

274 

275 :param directory: the directory to parse 

276 :param log_progress: should the progress be logged? 

277 :returns: the generator with the parsing results 

278 """ 

279 return self.parse(directory_path(directory), log_progress)