Example files that cause an unexpected (infinite?) delay when iterating over matches
jspraul opened this issue · comments
On Windows with Python 3.11 and polyfile 0.5.2, processing the following files as demonstrated in the README seems to take forever:
- https://github.com/ahupp/python-magic/files/9231524/memblock.txt (1,399 bytes)
- https://github.com/ggerganov/whisper.cpp/blob/master/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java (4,178 bytes)
python-magic, using a libmagic v4 build from years ago, reports an error mentioning regex and memory.
code
import polyfile
def from_file(file):
    """Print every polyfile magic match for the file at path ``file``.

    The path is echoed first. Then, for each match yielded by the default
    ``MagicMatcher`` instance, every MIME type of the match is printed,
    followed by the match's string form.
    """
    print(file)
    with open(file, "rb") as f:
        # the default instance automatically loads all file definitions
        for match in polyfile.magic.MagicMatcher.DEFAULT_INSTANCE.match(f.read()):
            for mimetype in match.mimetypes:
                # flush so results already found are visible even if a
                # later step of the match iteration stalls
                print(f"Matched MIME: {mimetype}", flush=True)
            print(f"Match string: {match!s}", flush=True)
# Run the matcher on this script itself (test3.py) and then on the two
# example files linked in the report.
from_file("test3.py")
from_file("memblock.txt")
from_file("whisper.cpp/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java")
Output, including the stack trace printed when pressing Ctrl+C after waiting ~5 minutes:
09/05/2023 01:45:27 C:\Users\WDAGUtilityAccount\Desktop> python.exe .\test3.py
test3.py
Matched MIME: text/plain
Match string: ascii text
memblock.txt
Matched MIME: text/x-c
Match string: C source text
Traceback (most recent call last):
File "C:\Users\WDAGUtilityAccount\Desktop\test3.py", line 13, in <module>
from_file("memblock.txt")
File "C:\Users\WDAGUtilityAccount\Desktop\test3.py", line 7, in from_file
for match in polyfile.magic.MagicMatcher.DEFAULT_INSTANCE.match(f.read()):
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2742, in match
if m and (not to_match.only_match_mime or any(t is not None for t in m.mimetypes)):
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2513, in __bool__
return any(m for m in self.mimetypes) or any(e for e in self.extensions) or bool(self.message())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2513, in <genexpr>
return any(m for m in self.mimetypes) or any(e for e in self.extensions) or bool(self.message())
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\iterators.py", line 44, in __iter__
yield self[i]
~~~~^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\iterators.py", line 30, in __getitem__
self._items.append(next(self._source_iter))
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\iterators.py", line 54, in unique
for t in iterator:
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2493, in <genexpr>
return LazyIterableSet((
^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2543, in __iter__
yield self[i]
~~~~^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2527, in __getitem__
result = next(self._result_iter)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 928, in _match
yield from child._match(context=context, parent_match=m)
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 917, in _match
m = self.test(context.data, absolute_offset, parent_match)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2103, in test
match = self.data_type.match(data[absolute_offset:], self.constant)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 1767, in match
m = expected.search(data[:self.length])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
09/05/2023 01:50:32 C:\Users\WDAGUtilityAccount\Desktop>