Coverage for pycommons / io / parser.py: 100%
65 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-06-03 00:58 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-06-03 00:58 +0000
"""
A tool for recursively parsing data from directories.

This module provides a unified API for parsing data from files in
directories. The goal is to offer a way to return a generator that
allows us to iterate over the data loaded. While we iterate over this
data, the generator internally iterates over the files.

This means that the control of how the data is loaded stays with the user,
while the programmer can implement the necessary methods to load and process
data in a natural way.
"""
14from typing import Final, Generator, Iterable, TypeVar
16from pycommons.io.console import logger
17from pycommons.io.path import Path, directory_path, file_path
18from pycommons.types import type_error
# NOTE(review): `class Parser[T]` declares its own type parameter, so the
# class body does not bind to this module-level variable -- presumably it is
# kept for code that imports `T` from this module; confirm before removing.
#: the type variable for data to be read from the directories
T = TypeVar("T")
24class Parser[T]:
25 """
26 The parser class.
28 This class allows you to implement convenient parsing routines that can
29 hierarchically process nested directories of files and return a stream,
30 i.e., a :class:`Generator` of results. In other words, it flattens the
31 hierarchical processing of directories into a linear sequence of data.
32 This allows the user of the API stay in control of when the data is loaded
33 while the programmer of the parser API can work in a convenient way with
34 high-level abstractions. Another advantage of this parsing API is that its
35 results can be processed like a stream and be piped into some filters,
36 processors, or even output destinations while it is loaded from the files.
37 For example, we can extract certain elements of data from huge collections
38 of files and while they are loaded, they could already be processed and
39 stored to a stream of CSV data.
41 The method :meth:`~pycommons.io.parser.Parser.parse` can be applied to any
42 path to a file or directory and will hierarchically process the path and
43 yield the parsing results one by one. This is the normal entry point
44 function for this parsing API.
45 The method :meth:`~pycommons.io.parser.Parser.parse_file` is a convenient
46 wrapper that processes a single file in *exactly the same way*.
47 The method :meth:`~pycommons.io.parser.Parser.parse_directory` parses a
48 path that identifies a directory.
50 This class offers an internal API, where the internal functions are
51 prefixed with `_`, that allows you to customize the hierarchical parsing
52 process to a high degree. You can decide which directories and files to
53 process, and you can set up and tear down datastructures on a per-file or
54 per-directory basis. All the internal functions are invoked in a
55 consistent way, regardless whether you parse single files or nested
56 directories.
57 """
59 def _start_parse(self, root: Path) -> None:
60 """
61 Begin the parsing process.
63 This method is called before the recursing parsing begins. It can be
64 used to initialize any internal datastructures to make the parser
65 reusable.
67 :param root: the root path of the parsing process
68 """
70 # pylint: disable=W0613
71 def _should_parse_file(self, file: Path) -> bool: # noqa: ARG002
72 """
73 Check whether we should start parsing a file.
75 The other file-parsing routines are only called if this method returns
76 `True` for a file. Any overriding method should first call the super
77 method.
79 :param file: the current file path
80 :returns: `True` if the file should be parsed, `False` otherwise
81 """
82 return True
84 def _start_parse_file(self, file: Path) -> None:
85 """
86 Check whether we should start parsing a file.
88 Any method overriding this method should first invoke the super method
89 and then perform its own startup code.
91 :param file: the current file path
92 """
94 # pylint: disable=W0613
95 def _parse_file(self, file: Path) -> T | None: # noqa: ARG002
96 """
97 Parse a file and return the result.
99 :param file: the current file path
100 :returns: the parsing result
101 """
102 return None
104 def _end_parse_file(self, file: Path) -> None:
105 """
106 Cleanup after a file has been parsed.
108 Any method overriding this function should first perform its own
109 cleanup and then call the super implementation.
111 :param file: the current file path
112 """
114 # pylint: disable=W0613
115 def _should_list_directory(self, directory: Path) \
116 -> tuple[bool, bool]: # noqa: ARG002
117 """
118 Check whether we should parse a directory.
120 This method is called whenever the parser enters a directory.
121 It should return a :class:`tuple` of two :class:`bool` values.
122 The first one indicates whether the sub-directories of this directory
123 should be processed. `True` means that they are listed and processed.
124 `False` means that they are skipped. The second Boolean value
125 indicates whether the files inside the directory should be listed.
126 `True` means that the files should be listed, `False` means that they
127 are not.
129 Any overriding method should first call the super method.
131 :param directory: the current directory path
132 :returns: A :class:`tuple` of two `bool` values, where the first one
133 indicates whether sub-directories should be visited and the
134 second one indicates whether files should be listed
135 """
136 return True, True
138 def _start_list_directory(self, directory: Path) -> None:
139 """
140 Prepare for listing a directory.
142 This method is only called if `_should_list_directory` returned
143 `True`.
145 :param directory: the current directory path
146 """
148 def _end_list_directory(self, directory: Path) -> None:
149 """
150 Clean up after a directory has been processed.
152 :param directory: the current directory path
153 """
155 def _end_parse(self, root: Path) -> None:
156 """
157 End the parsing process.
159 This method can perform any cleanup and purging of internal
160 datastructures to make the parser reusable.
162 :param root: the root path of the parsing process
163 """
165 def _progress_logger(self, text: str) -> None:
166 """
167 Log the progress.
169 This method is called with a string that should be logged. By default,
170 it forwards the string to :func:`logger`.
172 :param text: the test
173 """
174 logger(text)
176 def __internal_parse(self, paths: Iterable[Path], log_progress: bool,
177 is_root: bool) -> Generator[T, None, None]:
178 """
179 Perform the internal parsing work.
181 This method should never be called directly. It is called by `parse`.
183 :param paths: the paths to parse.
184 :param log_progress: should we log progress?
185 :param is_root: is this the root of parsing
186 :returns: the generator
187 """
188 current: Path | None = None
189 for current in paths:
190 if current.is_file():
191 # The current path identifies a file. We need to check whether
192 # this file should be parsed and, if so, parse it and yield
193 # from the parsing results.
194 should: bool = self._should_parse_file(current)
195 if not isinstance(should, bool): # type check
196 raise type_error(should, "should", bool)
197 if should: # OK, the file should be parsed.
198 self._start_parse_file(current)
199 result: T | None = self._parse_file(current)
200 if result is not None: # We got some result.
201 yield result
202 # Notify the end of parsing.
203 self._end_parse_file(current)
204 elif current.is_dir(): # The path is a directory.
205 # Check if we should parse.
206 list_dirs, list_files = self._should_list_directory(current)
207 if not isinstance(list_dirs, bool):
208 raise type_error( # wrong type
209 list_dirs, "retval[1] of start_list_dir", bool)
210 if not isinstance(list_files, bool):
211 raise type_error( # wrong type
212 list_files, "retval[2] of start_list_dir", bool)
213 if list_dirs or list_files:
214 self._start_list_directory(current)
215 # add the current directory name
216 if log_progress:
217 self._progress_logger(
218 f"entering directory {current!r}.")
219 yield from self.__internal_parse(current.list_dir(
220 list_files, list_dirs), log_progress, False)
221 self._end_list_directory(current)
222 if is_root:
223 self._end_parse(current)
224 if log_progress:
225 self._progress_logger(f"finished parsing {current!r}.")
227 def parse(self, path: str, log_progress: bool = True) \
228 -> Generator[T, None, None]:
229 """
230 Parse the given path.
232 :param path: the path to parse
233 :param log_progress: should the progress be logged?
234 :returns: the parsed sequence
235 """
236 root: Final[Path] = Path(path)
237 if not isinstance(log_progress, bool):
238 raise type_error(log_progress, "log_progress", bool)
240 if log_progress:
241 self._progress_logger(f"beginning to parse {root!r}.")
242 self._start_parse(root)
243 return self.__internal_parse((root, ), log_progress, True)
245 def parse_file(self, file: str, log_progress: bool = False) -> T:
246 """
247 Parse a single file.
249 This method guarantees to not return `None`. If the internal parsing
250 process yields `None` anyway, it will raise a :class:`TypeError`.
251 It will also raise a :class:`ValueError` if `file` does not identify a
252 file.
254 :param file: the file to parse
255 :param log_progress: should the progress be logged?
256 :returns: the parsing result.
257 """
258 path: Final[Path] = file_path(file)
259 try:
260 return next(self.parse(path, log_progress))
261 except StopIteration as se:
262 raise TypeError(
263 f"result of parsing file {path!r} should not be None.")\
264 from se
266 def parse_directory(self, directory: str, log_progress: bool = True) \
267 -> Generator[T, None, None]:
268 """
269 Parse a directory of files.
271 This function basically works exactly as
272 :meth:`~pycommons.io.parser.Parser.parse`, but it enforces that
273 `directory` is a directory and raises a :class:`ValueError` otherwise.
275 :param directory: the directory to parse
276 :param log_progress: should the progress be logged?
277 :returns: the generator with the parsing results
278 """
279 return self.parse(directory_path(directory), log_progress)