spaCy and Berkeley parser multiprocessing
hardianlawi opened this issue
I'm trying to make the multiprocessing spaCy pipeline work with the Berkeley parser, since I assume it will boost performance. How can I get it to work? I tried the suggestion from here, but it didn't work for me.
import multiprocessing as mp
import torch

mp.set_start_method('spawn')
torch.set_num_threads(1)

import spacy
import benepar

benepar.download("benepar_en3")

nlp = spacy.load("en_core_web_md")
if spacy.__version__.startswith("2"):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

docs = nlp.pipe(
    [
        'The time for action is now. It is never too late to do something.',
        'The time for action is now. It is never too late to do something.',
    ],
    n_process=2,
)
Error message
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-2-d850341268c5> in <module>
----> 1 for doc in docs:
2 break
3
~/miniforge3/envs/pod-classification/lib/python3.9/site-packages/spacy/language.py in pipe(self, texts, as_tuples, batch_size, disable, component_cfg, n_process)
1482 for pipe in pipes:
1483 docs = pipe(docs)
-> 1484 for doc in docs:
1485 yield doc
1486
~/miniforge3/envs/pod-classification/lib/python3.9/site-packages/spacy/language.py in _multiprocessing_pipe(self, texts, pipes, n_process, batch_size)
1518 ]
1519 for proc in procs:
-> 1520 proc.start()
1521
1522 # Cycle channels not to break the order of docs.
~/miniforge3/envs/pod-classification/lib/python3.9/multiprocessing/process.py in start(self)
119 'daemonic processes are not allowed to have children'
120 _cleanup()
--> 121 self._popen = self._Popen(self)
122 self._sentinel = self._popen.sentinel
123 # Avoid a refcycle if the target function holds an indirect
~/miniforge3/envs/pod-classification/lib/python3.9/multiprocessing/context.py in _Popen(process_obj)
222 @staticmethod
223 def _Popen(process_obj):
--> 224 return _default_context.get_context().Process._Popen(process_obj)
225
226 class DefaultContext(BaseContext):
~/miniforge3/envs/pod-classification/lib/python3.9/multiprocessing/context.py in _Popen(process_obj)
282 def _Popen(process_obj):
283 from .popen_spawn_posix import Popen
--> 284 return Popen(process_obj)
285
286 class ForkServerProcess(process.BaseProcess):
~/miniforge3/envs/pod-classification/lib/python3.9/multiprocessing/popen_spawn_posix.py in __init__(self, process_obj)
30 def __init__(self, process_obj):
31 self._fds = []
---> 32 super().__init__(process_obj)
33
34 def duplicate_for_child(self, fd):
~/miniforge3/envs/pod-classification/lib/python3.9/multiprocessing/popen_fork.py in __init__(self, process_obj)
17 self.returncode = None
18 self.finalizer = None
---> 19 self._launch(process_obj)
20
21 def duplicate_for_child(self, fd):
~/miniforge3/envs/pod-classification/lib/python3.9/multiprocessing/popen_spawn_posix.py in _launch(self, process_obj)
45 try:
46 reduction.dump(prep_data, fp)
---> 47 reduction.dump(process_obj, fp)
48 finally:
49 set_spawning_popen(None)
~/miniforge3/envs/pod-classification/lib/python3.9/multiprocessing/reduction.py in dump(obj, file, protocol)
58 def dump(obj, file, protocol=None):
59 '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60 ForkingPickler(file, protocol).dump(obj)
61
62 #
AttributeError: Can't pickle local object 'install_spacy_extensions.<locals>.<lambda>'
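The traceback points at the cause: with the spawn start method, spaCy pickles the pipeline to hand it to the worker processes, and benepar registers its spaCy extensions with lambdas defined inside install_spacy_extensions, which cannot be pickled. A minimal workaround sketch, assuming benepar_en3 is already downloaded and each worker can afford to load its own copy of the pipeline; _init_worker and parse_batch are illustrative names, not benepar API:

import multiprocessing as mp

import spacy
import benepar

_nlp = None  # one pipeline per worker process

def _init_worker():
    # Runs once in each worker: build the pipeline locally instead of
    # pickling it across the process boundary.
    global _nlp
    _nlp = spacy.load("en_core_web_md")
    if spacy.__version__.startswith("2"):
        _nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
    else:
        _nlp.add_pipe("benepar", config={"model": "benepar_en3"})

def parse_batch(texts):
    # Return plain parse strings rather than Doc objects, since Docs carrying
    # benepar annotations may not pickle cleanly on the way back either.
    return [[sent._.parse_string for sent in doc.sents] for doc in _nlp.pipe(texts)]

if __name__ == "__main__":
    texts = ['The time for action is now. It is never too late to do something.'] * 4
    chunks = [texts[i::2] for i in range(2)]  # two workers, two chunks
    with mp.get_context("spawn").Pool(processes=2, initializer=_init_worker) as pool:
        results = pool.map(parse_batch, chunks)

Returning parse strings instead of Doc objects sidesteps a second pickling problem on the way back from the workers.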
I also tried running the spaCy pipeline on the GPU by adding the code below, but it does not seem to give much of a boost.
import spacy
import torch

spacy.prefer_gpu()  # must be called before spacy.load() to take effect
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
The GPU did not improve performance at all; I guess that's because the data still has to go through the rest of the spaCy pipeline first.
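For what it's worth, the GPU mainly helps benepar's batched neural forward pass, so any gain tends to come from feeding it larger batches; as far as I know, spaCy also does not support combining n_process with a GPU, so the two approaches don't stack. A small sketch along those lines:

import spacy
import benepar  # importing benepar registers the "benepar" factory with spaCy v3

spacy.prefer_gpu()  # must run before spacy.load() so models land on the GPU
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})

texts = ['The time for action is now. It is never too late to do something.'] * 256
# Larger batches keep the GPU busy; leave n_process at its default of 1.
for doc in nlp.pipe(texts, batch_size=64):
    pass  # consume the docs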
If you instantiate two nlp pipelines in spaCy, one normal and one with benepar added, then when you process sentences the normal pipeline runs all of its components while the benepar pipeline only runs the parser. This gives you roughly 2x processing speed.
import spacy
import benepar

nlp1 = spacy.load('en_core_web_md')  # full pipeline, no constituency parsing
nlp2 = spacy.load('en_core_web_sm')  # minimal pipeline that hosts benepar
if spacy.__version__.startswith('2'):
    nlp2.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp2.add_pipe("benepar", config={"model": "benepar_en3"})

# examples['sentence'] is the list of input sentences
docs_nlp1 = list(nlp1.pipe(examples['sentence'], disable=["tok2vec"], n_process=4))
docs_nlp2 = list(nlp2.pipe(examples['sentence'], disable=["tok2vec", "tagger", "ner", "lemmatizer", "textcat"]))
If you use multiple threads to run the two pipelines independently, I guess the speed could be further improved to roughly 4x the original. However, I do not know how to run spaCy in a multi-threaded setup.
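A sketch of that threading idea, assuming nlp1, nlp2, and examples from the snippet above are already set up; whether it actually helps depends on how much of the work releases the GIL (the torch and numpy heavy parts largely do):

from concurrent.futures import ThreadPoolExecutor

def run_plain(sentences):
    # full spaCy pipeline without constituency parsing
    return list(nlp1.pipe(sentences, disable=["tok2vec"], n_process=4))

def run_benepar(sentences):
    # constituency parsing only
    return list(nlp2.pipe(sentences, disable=["tok2vec", "tagger", "ner", "lemmatizer", "textcat"]))

sentences = examples['sentence']
with ThreadPoolExecutor(max_workers=2) as pool:
    fut1 = pool.submit(run_plain, sentences)
    fut2 = pool.submit(run_benepar, sentences)
    docs_nlp1, docs_nlp2 = fut1.result(), fut2.result()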