Coverage for pycommons / io / parser.py: 100%

65 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-11 03:04 +0000

1""" 

2A tool for recursively parsing data from directories. 

3 

4This module provides a unified API for parsing data from files in 

5directories. The goal is to offer a way to return a generator that 

6allows us to iterate over the data loaded. While we iterate over this 

7data, the generator internally iterates over the files. 

8 

9This means that the control of how the data is loaded stays with the user, 

10while the programmer can implement the necessary methods to load and process 

11data in a natural way. 

12""" 

13 

14from typing import Final, Generator, Iterable, TypeVar 

15 

16from pycommons.io.console import logger 

17from pycommons.io.path import Path, directory_path, file_path 

18from pycommons.types import type_error 

19 

# NOTE(review): `class Parser[T]` below declares its own class-scoped type
# parameter `T` (PEP 695 syntax), so the class body does not refer to this
# module-level variable; presumably it is kept for other modules that
# import it -- confirm before removing.
#: the type variable for data to be read from the directories
T = TypeVar("T")

22 

23 

24class Parser[T]: 

25 """ 

26 The parser class. 

27 

28 This class allows you to implement convenient parsing routines that can 

29 hierarchically process nested directories of files and return a stream, 

30 i.e., a :class:`Generator` of results. In other words, it flattens the 

31 hierarchical processing of directories into a linear sequence of data. 

32 This allows the user of the API stay in control of when the data is loaded 

33 while the programmer of the parser API can work in a convenient way with 

34 high-level abstractions. Another advantage of this parsing API is that its 

35 results can be processed like a stream and be piped into some filters, 

36 processors, or even output destinations while it is loaded from the files. 

37 For example, we can extract certain elements of data from huge collections 

38 of files and while they are loaded, they could already be processed and 

39 stored to a stream of CSV data. 

40 

41 The method :meth:`~pycommons.io.parser.Parser.parse` can be applied to any 

42 path to a file or directory and will hierarchically process the path and 

43 yield the parsing results one by one. This is the normal entry point 

44 function for this parsing API. 

45 The method :meth:`~pycommons.io.parser.Parser.parse_file` is a convenient 

46 wrapper that processes a single file in *exactly the same way*. 

47 The method :meth:`~pycommons.io.parser.Parser.parse_directory` parses a 

48 path that identifies a directory. 

49 

50 This class offers an internal API, where the internal functions are 

51 prefixed with `_`, that allows you to customize the hierarchical parsing 

52 process to a high degree. You can decide which directories and files to 

53 process, and you can set up and tear down datastructures on a per-file or 

54 per-directory basis. All the internal functions are invoked in a 

55 consistent way, regardless whether you parse single files or nested 

56 directories. 

57 """ 

58 

59 def _start_parse(self, root: Path) -> None: 

60 """ 

61 Begin the parsing process. 

62 

63 This method is called before the recursing parsing begins. It can be 

64 used to initialize any internal datastructures to make the parser 

65 reusable. 

66 

67 :param root: the root path of the parsing process 

68 """ 

69 

70 # pylint: disable=W0613 

71 def _should_parse_file(self, file: Path) -> bool: # noqa: ARG002 

72 """ 

73 Check whether we should start parsing a file. 

74 

75 The other file-parsing routines are only called if this method returns 

76 `True` for a file. Any overriding method should first call the super 

77 method. 

78 

79 :param file: the current file path 

80 :returns: `True` if the file should be parsed, `False` otherwise 

81 """ 

82 return True 

83 

84 def _start_parse_file(self, file: Path) -> None: 

85 """ 

86 Check whether we should start parsing a file. 

87 

88 Any method overriding this method should first invoke the super method 

89 and then perform its own startup code. 

90 

91 :param file: the current file path 

92 """ 

93 

94 # pylint: disable=W0613 

95 def _parse_file(self, file: Path) -> T | None: # noqa: ARG002 

96 """ 

97 Parse a file and return the result. 

98 

99 :param file: the current file path 

100 :returns: the parsing result 

101 """ 

102 return None 

103 

104 def _end_parse_file(self, file: Path) -> None: 

105 """ 

106 Cleanup after a file has been parsed. 

107 

108 Any method overriding this function should first perform its own 

109 cleanup and then call the super implementation. 

110 

111 :param file: the current file path 

112 """ 

113 

114 # pylint: disable=W0613 

115 def _should_list_directory(self, directory: Path) \ 

116 -> tuple[bool, bool]: # noqa: ARG002 

117 """ 

118 Check whether we should parse a directory. 

119 

120 This method is called whenever the parser enters a directory. 

121 It should return a :class:`tuple` of two :class:`bool` values. 

122 The first one indicates whether the sub-directories of this directory 

123 should be processed. `True` means that they are listed and processed. 

124 `False` means that they are skipped. The second Boolean value 

125 indicates whether the files inside the directory should be listed. 

126 `True` means that the files should be listed, `False` means that they 

127 are not. 

128 

129 Any overriding method should first call the super method. 

130 

131 :param directory: the current directory path 

132 :returns: A :class:`tuple` of two `bool` values, where the first one 

133 indicates whether sub-directories should be visited and the 

134 second one indicates whether files should be listed 

135 """ 

136 return True, True 

137 

138 def _start_list_directory(self, directory: Path) -> None: 

139 """ 

140 Prepare for listing a directory. 

141 

142 This method is only called if `_should_list_directory` returned 

143 `True`. 

144 

145 :param directory: the current directory path 

146 """ 

147 

148 def _end_list_directory(self, directory: Path) -> None: 

149 """ 

150 Clean up after a directory has been processed. 

151 

152 :param directory: the current directory path 

153 """ 

154 

155 def _end_parse(self, root: Path) -> None: 

156 """ 

157 End the parsing process. 

158 

159 This method can perform any cleanup and purging of internal 

160 datastructures to make the parser reusable. 

161 

162 :param root: the root path of the parsing process 

163 """ 

164 

165 def _progress_logger(self, text: str) -> None: 

166 """ 

167 Log the progress. 

168 

169 This method is called with a string that should be logged. By default, 

170 it forwards the string to :func:`logger`. 

171 

172 :param text: the test 

173 """ 

174 logger(text) 

175 

176 def __internal_parse(self, paths: Iterable[Path], log_progress: bool, 

177 is_root: bool) \ 

178 -> Generator[T, None, None]: 

179 """ 

180 Perform the internal parsing work. 

181 

182 This method should never be called directly. It is called by `parse`. 

183 

184 :param paths: the paths to parse. 

185 :param log_progress: should we log progress? 

186 :param is_root: is this the root of parsing 

187 :returns: the generator 

188 """ 

189 current: Path | None = None 

190 for current in paths: 

191 if current.is_file(): 

192 # The current path identifies a file. We need to check whether 

193 # this file should be parsed and, if so, parse it and yield 

194 # from the parsing results. 

195 should: bool = self._should_parse_file(current) 

196 if not isinstance(should, bool): # type check 

197 raise type_error(should, "should", bool) 

198 if should: # OK, the file should be parsed. 

199 self._start_parse_file(current) 

200 result: T | None = self._parse_file(current) 

201 if result is not None: # We got some result. 

202 yield result 

203 # Notify the end of parsing. 

204 self._end_parse_file(current) 

205 elif current.is_dir(): # The path is a directory. 

206 # Check if we should parse. 

207 list_dirs, list_files = self._should_list_directory(current) 

208 if not isinstance(list_dirs, bool): 

209 raise type_error( # wrong type 

210 list_dirs, "retval[1] of start_list_dir", bool) 

211 if not isinstance(list_files, bool): 

212 raise type_error( # wrong type 

213 list_files, "retval[2] of start_list_dir", bool) 

214 if list_dirs or list_files: 

215 self._start_list_directory(current) 

216 # add the current directory name 

217 if log_progress: 

218 self._progress_logger( 

219 f"entering directory {current!r}.") 

220 yield from self.__internal_parse(current.list_dir( 

221 list_files, list_dirs), log_progress, False) 

222 self._end_list_directory(current) 

223 if is_root: 

224 self._end_parse(current) 

225 if log_progress: 

226 self._progress_logger(f"finished parsing {current!r}.") 

227 

228 def parse(self, path: str, log_progress: bool = True) \ 

229 -> Generator[T, None, None]: 

230 """ 

231 Parse the given path. 

232 

233 :param path: the path to parse 

234 :param log_progress: should the progress be logged? 

235 :returns: the parsed sequence 

236 """ 

237 root: Final[Path] = Path(path) 

238 if not isinstance(log_progress, bool): 

239 raise type_error(log_progress, "log_progress", bool) 

240 

241 if log_progress: 

242 self._progress_logger(f"beginning to parse {root!r}.") 

243 self._start_parse(root) 

244 return self.__internal_parse((root, ), log_progress, True) 

245 

246 def parse_file(self, file: str, log_progress: bool = False) -> T: 

247 """ 

248 Parse a single file. 

249 

250 This method guarantees to not return `None`. If the internal parsing 

251 process yields `None` anyway, it will raise a :class:`TypeError`. 

252 It will also raise a :class:`ValueError` if `file` does not identify a 

253 file. 

254 

255 :param file: the file to parse 

256 :param log_progress: should the progress be logged? 

257 :returns: the parsing result. 

258 """ 

259 path: Final[Path] = file_path(file) 

260 try: 

261 return next(self.parse(path, log_progress)) 

262 except StopIteration as se: 

263 raise TypeError( 

264 f"result of parsing file {path!r} should not be None.")\ 

265 from se 

266 

267 def parse_directory(self, directory: str, log_progress: bool = True) \ 

268 -> Generator[T, None, None]: 

269 """ 

270 Parse a directory of files. 

271 

272 This function basically works exactly as 

273 :meth:`~pycommons.io.parser.Parser.parse`, but it enforces that 

274 `directory` is a directory and raises a :class:`ValueError` otherwise. 

275 

276 :param directory: the directory to parse 

277 :param log_progress: should the progress be logged? 

278 :returns: the generator with the parsing results 

279 """ 

280 return self.parse(directory_path(directory), log_progress)