Coverage for pycommons / io / parser.py: 100%
65 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-11 03:04 +0000
1"""
2A tool for recursively parsing data from directories.
4This module provides a unified API for parsing data from files in
5directories. The goal is to offer a way to return a generator that
6allows us to iterate over the data loaded. While we iterate over this
7data, the generator internally iterates over the files.
9This means that the control of how the data is loaded stays with the user,
10while the programmer can implement the necessary methods to load and process
11data in a natural way.
12"""
14from typing import Final, Generator, Iterable, TypeVar
16from pycommons.io.console import logger
17from pycommons.io.path import Path, directory_path, file_path
18from pycommons.types import type_error
20#: the type variable for data to be read from the directories
21T = TypeVar("T")
24class Parser[T]:
25 """
26 The parser class.
28 This class allows you to implement convenient parsing routines that can
29 hierarchically process nested directories of files and return a stream,
30 i.e., a :class:`Generator` of results. In other words, it flattens the
31 hierarchical processing of directories into a linear sequence of data.
32 This allows the user of the API stay in control of when the data is loaded
33 while the programmer of the parser API can work in a convenient way with
34 high-level abstractions. Another advantage of this parsing API is that its
35 results can be processed like a stream and be piped into some filters,
36 processors, or even output destinations while it is loaded from the files.
37 For example, we can extract certain elements of data from huge collections
38 of files and while they are loaded, they could already be processed and
39 stored to a stream of CSV data.
41 The method :meth:`~pycommons.io.parser.Parser.parse` can be applied to any
42 path to a file or directory and will hierarchically process the path and
43 yield the parsing results one by one. This is the normal entry point
44 function for this parsing API.
45 The method :meth:`~pycommons.io.parser.Parser.parse_file` is a convenient
46 wrapper that processes a single file in *exactly the same way*.
47 The method :meth:`~pycommons.io.parser.Parser.parse_directory` parses a
48 path that identifies a directory.
50 This class offers an internal API, where the internal functions are
51 prefixed with `_`, that allows you to customize the hierarchical parsing
52 process to a high degree. You can decide which directories and files to
53 process, and you can set up and tear down datastructures on a per-file or
54 per-directory basis. All the internal functions are invoked in a
55 consistent way, regardless whether you parse single files or nested
56 directories.
57 """
59 def _start_parse(self, root: Path) -> None:
60 """
61 Begin the parsing process.
63 This method is called before the recursing parsing begins. It can be
64 used to initialize any internal datastructures to make the parser
65 reusable.
67 :param root: the root path of the parsing process
68 """
70 # pylint: disable=W0613
71 def _should_parse_file(self, file: Path) -> bool: # noqa: ARG002
72 """
73 Check whether we should start parsing a file.
75 The other file-parsing routines are only called if this method returns
76 `True` for a file. Any overriding method should first call the super
77 method.
79 :param file: the current file path
80 :returns: `True` if the file should be parsed, `False` otherwise
81 """
82 return True
84 def _start_parse_file(self, file: Path) -> None:
85 """
86 Check whether we should start parsing a file.
88 Any method overriding this method should first invoke the super method
89 and then perform its own startup code.
91 :param file: the current file path
92 """
94 # pylint: disable=W0613
95 def _parse_file(self, file: Path) -> T | None: # noqa: ARG002
96 """
97 Parse a file and return the result.
99 :param file: the current file path
100 :returns: the parsing result
101 """
102 return None
104 def _end_parse_file(self, file: Path) -> None:
105 """
106 Cleanup after a file has been parsed.
108 Any method overriding this function should first perform its own
109 cleanup and then call the super implementation.
111 :param file: the current file path
112 """
114 # pylint: disable=W0613
115 def _should_list_directory(self, directory: Path) \
116 -> tuple[bool, bool]: # noqa: ARG002
117 """
118 Check whether we should parse a directory.
120 This method is called whenever the parser enters a directory.
121 It should return a :class:`tuple` of two :class:`bool` values.
122 The first one indicates whether the sub-directories of this directory
123 should be processed. `True` means that they are listed and processed.
124 `False` means that they are skipped. The second Boolean value
125 indicates whether the files inside the directory should be listed.
126 `True` means that the files should be listed, `False` means that they
127 are not.
129 Any overriding method should first call the super method.
131 :param directory: the current directory path
132 :returns: A :class:`tuple` of two `bool` values, where the first one
133 indicates whether sub-directories should be visited and the
134 second one indicates whether files should be listed
135 """
136 return True, True
138 def _start_list_directory(self, directory: Path) -> None:
139 """
140 Prepare for listing a directory.
142 This method is only called if `_should_list_directory` returned
143 `True`.
145 :param directory: the current directory path
146 """
148 def _end_list_directory(self, directory: Path) -> None:
149 """
150 Clean up after a directory has been processed.
152 :param directory: the current directory path
153 """
155 def _end_parse(self, root: Path) -> None:
156 """
157 End the parsing process.
159 This method can perform any cleanup and purging of internal
160 datastructures to make the parser reusable.
162 :param root: the root path of the parsing process
163 """
165 def _progress_logger(self, text: str) -> None:
166 """
167 Log the progress.
169 This method is called with a string that should be logged. By default,
170 it forwards the string to :func:`logger`.
172 :param text: the test
173 """
174 logger(text)
176 def __internal_parse(self, paths: Iterable[Path], log_progress: bool,
177 is_root: bool) \
178 -> Generator[T, None, None]:
179 """
180 Perform the internal parsing work.
182 This method should never be called directly. It is called by `parse`.
184 :param paths: the paths to parse.
185 :param log_progress: should we log progress?
186 :param is_root: is this the root of parsing
187 :returns: the generator
188 """
189 current: Path | None = None
190 for current in paths:
191 if current.is_file():
192 # The current path identifies a file. We need to check whether
193 # this file should be parsed and, if so, parse it and yield
194 # from the parsing results.
195 should: bool = self._should_parse_file(current)
196 if not isinstance(should, bool): # type check
197 raise type_error(should, "should", bool)
198 if should: # OK, the file should be parsed.
199 self._start_parse_file(current)
200 result: T | None = self._parse_file(current)
201 if result is not None: # We got some result.
202 yield result
203 # Notify the end of parsing.
204 self._end_parse_file(current)
205 elif current.is_dir(): # The path is a directory.
206 # Check if we should parse.
207 list_dirs, list_files = self._should_list_directory(current)
208 if not isinstance(list_dirs, bool):
209 raise type_error( # wrong type
210 list_dirs, "retval[1] of start_list_dir", bool)
211 if not isinstance(list_files, bool):
212 raise type_error( # wrong type
213 list_files, "retval[2] of start_list_dir", bool)
214 if list_dirs or list_files:
215 self._start_list_directory(current)
216 # add the current directory name
217 if log_progress:
218 self._progress_logger(
219 f"entering directory {current!r}.")
220 yield from self.__internal_parse(current.list_dir(
221 list_files, list_dirs), log_progress, False)
222 self._end_list_directory(current)
223 if is_root:
224 self._end_parse(current)
225 if log_progress:
226 self._progress_logger(f"finished parsing {current!r}.")
228 def parse(self, path: str, log_progress: bool = True) \
229 -> Generator[T, None, None]:
230 """
231 Parse the given path.
233 :param path: the path to parse
234 :param log_progress: should the progress be logged?
235 :returns: the parsed sequence
236 """
237 root: Final[Path] = Path(path)
238 if not isinstance(log_progress, bool):
239 raise type_error(log_progress, "log_progress", bool)
241 if log_progress:
242 self._progress_logger(f"beginning to parse {root!r}.")
243 self._start_parse(root)
244 return self.__internal_parse((root, ), log_progress, True)
246 def parse_file(self, file: str, log_progress: bool = False) -> T:
247 """
248 Parse a single file.
250 This method guarantees to not return `None`. If the internal parsing
251 process yields `None` anyway, it will raise a :class:`TypeError`.
252 It will also raise a :class:`ValueError` if `file` does not identify a
253 file.
255 :param file: the file to parse
256 :param log_progress: should the progress be logged?
257 :returns: the parsing result.
258 """
259 path: Final[Path] = file_path(file)
260 try:
261 return next(self.parse(path, log_progress))
262 except StopIteration as se:
263 raise TypeError(
264 f"result of parsing file {path!r} should not be None.")\
265 from se
267 def parse_directory(self, directory: str, log_progress: bool = True) \
268 -> Generator[T, None, None]:
269 """
270 Parse a directory of files.
272 This function basically works exactly as
273 :meth:`~pycommons.io.parser.Parser.parse`, but it enforces that
274 `directory` is a directory and raises a :class:`ValueError` otherwise.
276 :param directory: the directory to parse
277 :param log_progress: should the progress be logged?
278 :returns: the generator with the parsing results
279 """
280 return self.parse(directory_path(directory), log_progress)