Memory map error
tommedema opened this issue · comments
I'm running a single host with 80 cores to do some data science calculations.
My machine has 128GB of RAM but unfortunately ipyparallel is going way beyond that when I run it with 80 cores, it's hitting the limit at about 40-50 cores and then crashes. I'd push numpy arrays to the engines with dview.push.
I then read about the memmap method and thought it was worth a shot.
So on the host I am now writing to the memfile:
# update memfile for engines
np.save(clusterMemFilePath, np.array([qois, trainingSampleSeries, testingSampleSeries, sampleAdditions], dtype = object))
And on the engine I am reading the memfile:
# Engine-side snippet: reuse the arrays if a previous task on this engine
# already loaded them, otherwise map the shared file once and memoize it
# in the engine's globals so later tasks skip the load.
gs = globals()
# load from mem file instead of writing to all engines
# see https://ipyparallel.readthedocs.io/en/stable/examples/broadcast/memmap%20Broadcast.html
try:
    # Fast path: a previous task already installed the arrays in globals.
    qois = gs['qois']
    trainingSampleSeries = gs['trainingSampleSeries']
    testingSampleSeries = gs['testingSampleSeries']
    sampleAdditions = gs['sampleAdditions']
except KeyError:
    # NOTE(review): np.memmap maps the raw bytes of the file and does not parse
    # the .npy header that np.save wrote; dtype=object is also not supported by
    # memory mapping — presumably the cause of the SIGSEGV reported below.
    # TODO confirm; np.load(path, mmap_mode=...) is the header-aware loader.
    qois, trainingSampleSeries, testingSampleSeries, sampleAdditions = np.memmap(clusterMemFilePath, mode = 'r+', dtype = object, shape = (4,))
    # Memoize for subsequent tasks on this engine.
    gs['qois'] = qois
    gs['trainingSampleSeries'] = trainingSampleSeries
    gs['testingSampleSeries'] = testingSampleSeries
    gs['sampleAdditions'] = sampleAdditions
I also added @ipp.interactive as a decorator for the engine function.
However this is resulting in the cluster crashing:
engine set stopped 1665014752: {'engines': {'9': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42356, 'identifier': '9'}, '3': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42325, 'identifier': '3'}, '6': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42341, 'identifier': '6'}, '0': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42318, 'identifier': '0'}, '7': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42344, 'identifier': '7'}, '5': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42333, 'identifier': '5'}, '4': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42329, 'identifier': '4'}, '8': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42350, 'identifier': '8'}, '2': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42320, 'identifier': '2'}, '1': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42319, 'identifier': '1'}}, 'exit_code': <Negsignal.SIGSEGV: -11>}
WARNING:ipyparallel.cluster.cluster.1665014751-102m:engine set stopped 1665014752: {'engines': {'9': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42356, 'identifier': '9'}, '3': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42325, 'identifier': '3'}, '6': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42341, 'identifier': '6'}, '0': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42318, 'identifier': '0'}, '7': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42344, 'identifier': '7'}, '5': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42333, 'identifier': '5'}, '4': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42329, 'identifier': '4'}, '8': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42350, 'identifier': '8'}, '2': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42320, 'identifier': '2'}, '1': {'exit_code': <Negsignal.SIGSEGV: -11>, 'pid': 42319, 'identifier': '1'}}, 'exit_code': <Negsignal.SIGSEGV: -11>}
[Engine Exception]
Traceback (most recent call last):
File "/Users/playground/.pyenv/versions/3.10.6/lib/python3.10/site-packages/ipyparallel/controller/task_scheduler.py", line 316, in handle_stranded_tasks
raise error.EngineError(
ipyparallel.error.EngineError: Engine b'15137a1a-2f0b1f8aae1b46b434d5db58' died while running task '1dfbdda4-56496e7ce5cec7f43f532f86_42290_11'
Do you have any advice on how I can make sure the arrays are not copied? Since I am on a single machine each process should read from the same place in memory rather than copying it 80 times.
I also tried copying the literal example at https://github.com/jupyter/ngcm-tutorial/blob/524ea8065de366c6e7f04121718cb96c0ed139c5/Part-3/examples/memmap.ipynb but this gives me:
KeyError: 'data'
On the engine, it seems it cannot access the global set:
@ipp.interactive
def parallelTrainTestQuery(queryIndex):
    """Engine task: unpack the broadcast data array into the four series.

    Assumes `clusterMemDataName` was pushed to the engines beforehand and that
    bcast_memmap installed the array under that name in the engine's globals
    — the KeyError branch below fires when that broadcast never happened.
    """
    import numpy as np
    # load from mem file instead of writing to all engines
    # see https://ipyparallel.readthedocs.io/en/stable/examples/broadcast/memmap%20Broadcast.html
    try:
        data = globals()[clusterMemDataName]
        # Debug output; display() is the IPython display hook available on engines.
        print(data)
        display(data)
        # Presumably data was saved as a 4-element object array in this order
        # — verify against the np.save call on the host. TODO confirm.
        qois = data[0]
        trainingSampleSeries = data[1]
        testingSampleSeries = data[2]
        sampleAdditions = data[3]
    except KeyError:
        raise Exception('You must call bcast_memmap, see https://github.com/jupyter/ngcm-tutorial/blob/524ea8065de366c6e7f04121718cb96c0ed139c5/Part-3/examples/memmap.ipynb')
I also tried the variation at https://ipyparallel.readthedocs.io/en/stable/examples/broadcast/memmap%20Broadcast.html?highlight=memory#More-efficient-broadcast-of-arrays-with-memmap and this gives another error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File <timed exec>:174
File <timed exec>:121, in computeNextFold(iterationCount, nextQueryIndices)
Cell In [30], line 27, in bcast_memmap(data, name, client, host_engines)
24 # actually push the data, just once to each machine
25 memmap_path_name = f"_bcast_array_{name}"
---> 27 one_per_host = rc.broadcast_view([engines[0] for engines in host_engines.values()], coalescing=True)
28 send_ar = one_per_host.apply_async(array_to_file, data, name=memmap_path_name)
30 # load the data on all engines into a memmapped array
File ~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/ipyparallel/client/client.py:2088, in Client.broadcast_view(self, targets, is_coalescing, **kwargs)
2075 """construct a BroadCastView object.
2076 If no arguments are specified, create a BroadCastView using all engines
2077 using all engines.
(...)
2084 **kwargs : passed to BroadCastView
2085 """
2086 targets = self._build_targets(targets)[1]
-> 2088 bcast_view = BroadcastView(
2089 client=self,
2090 socket=self._broadcast_stream,
2091 targets=targets,
2092 **kwargs,
2093 )
2094 bcast_view.is_coalescing = is_coalescing
2095 return bcast_view
File ~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/ipyparallel/client/view.py:392, in DirectView.__init__(self, client, socket, targets, **flags)
391 def __init__(self, client=None, socket=None, targets=None, **flags):
--> 392 super().__init__(client=client, socket=socket, targets=targets, **flags)
File ~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/ipyparallel/client/view.py:117, in View.__init__(self, client, socket, **flags)
114 self.block = client.block
115 self.executor = ViewExecutor(self)
--> 117 self.set_flags(**flags)
119 assert not self.__class__ is View, "Don't use base View objects, use subclasses"
File ~/.pyenv/versions/3.10.6/lib/python3.10/site-packages/ipyparallel/client/view.py:152, in View.set_flags(self, **kwargs)
150 for name, value in kwargs.items():
151 if name not in self._flag_names:
--> 152 raise KeyError("Invalid name: %r" % name)
153 else:
154 setattr(self, name, value)
KeyError: "Invalid name: 'coalescing'"
I then changed `coalescing` to `is_coalescing`, and now I get the `KeyError: 'data'` again.
I thought this may be because of the f"_bcast_array_{name}"
prefix, so I added that too, but it's still not found. I then printed all globals and this is what I got:
['__name__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', '_', '__', '___', 'clusterMemDataName', 'cvTrainingFoldMaxMatchCount', 'cvTrainingFoldMinMatchCount', 'getQuerySeriesResults', 'getQuerySuccessRate', 'getSearchParameters', '_e763e0cadefce90220124bd01fdc2bb2_46673_23_f', '_e763e0cadefce90220124bd01fdc2bb2_46673_23_args', '_e763e0cadefce90220124bd01fdc2bb2_46673_23_kwargs', '_e763e0cadefce90220124bd01fdc2bb2_46673_23_result']
So it seems the global never arrives or is accessible on the engine.
In case it's relevant, I am invoking the engines that require the data using lview.imap:
# Fan the query indices out over the load-balanced view, collecting results
# as they complete.
# NOTE(review): with ordered=False, `i` is completion order, not submission
# order — results[i] pairs a result with its arrival rank, not its queryIndex.
# Confirm that is intended.
for i, result in enumerate(
    lview.imap(parallelTrainTestQuery, nextQueryIndices, ordered = False, max_outstanding = 'auto')
):
    results[i] = result
And I make sure that the clusterMemDataName
variable is available using dview.push prior to that:
# push shared functions and settings to child processes
# (block=True so all engines have these names before any task runs;
# clusterMemDataName is the global the engine tasks look up)
dview.push(dict(
    getSearchParameters = getSearchParameters,
    getQuerySeriesResults = getQuerySeriesResults,
    getQuerySuccessRate = getQuerySuccessRate,
    cvTrainingFoldMinMatchCount = cvTrainingFoldMinMatchCount,
    cvTrainingFoldMaxMatchCount = cvTrainingFoldMaxMatchCount,
    clusterMemDataName = clusterMemDataName
), block = True)
I'm on 8.4.1.
I've since simplified things a lot to try and narrow down the issue. This is my simplified code:
import numpy as np
import os
import socket  # was missing: socket.gethostname below raised NameError without it
import ipyparallel as ipp

# Name under which the broadcast array is installed in each engine's globals,
# and the on-disk location the engines memory-map it from.
clusterMemDataName = 'data'
clusterMemFilePath = './cache/memfile.npy'
# the number of processes we'll spawn
clusterProcessesCount = os.cpu_count()

# Start a local cluster and wait until every engine has registered.
cluster = ipp.Cluster(n = clusterProcessesCount)
cluster.start_cluster_sync()
rc = cluster.connect_client_sync()
rc.wait_for_engines(clusterProcessesCount)
lview = rc.load_balanced_view()
dview = rc[:]

# Group engine ids by the host they run on, so the broadcast can write the
# data file just once per machine.
engine_hosts = dview.apply_async(socket.gethostname).get_dict()
host_engines = {}
for eid, host in engine_hosts.items():
    host_engines.setdefault(host, []).append(eid)
@ipp.interactive
def array_to_file(data, path):
    """Persist *data* to *path* in .npy format (runs once per host on an engine)."""
    import numpy
    numpy.save(path, data)
@ipp.interactive
def load_memmap(name, path, mode = 'r+'):
    """Memory-map the array saved at *path* into this engine's globals as *name*.

    Uses np.load with mmap_mode rather than np.memmap: the file was written by
    np.save, so it begins with an .npy header describing dtype and shape.
    np.memmap ignores that header and maps the raw bytes as uint8, which is why
    the data came back corrupted. allow_pickle=True permits loading arrays that
    contain Python objects (NOTE: object-dtype arrays cannot actually be
    memory-mapped — save each numeric array to its own file instead).
    """
    import numpy as np
    # header-aware, zero-copy load shared between engines on the same host
    globals()[name] = np.load(path, mmap_mode = mode, allow_pickle = True)
def bcast_memmap(data, name, memFilePath, client, host_engines):
    """Write *data* to disk once per host, then memory-map it on every engine.

    host_engines maps hostname -> list of engine ids; the first engine on each
    host does the write so the file exists exactly once per machine.
    """
    # Step 1: one representative engine per host persists the array to disk.
    first_engine_per_host = [eids[0] for eids in host_engines.values()]
    writers = client.broadcast_view(first_engine_per_host, is_coalescing = True)
    writers.apply_sync(array_to_file, data, path = memFilePath)
    # Step 2: every engine maps the on-disk file into its globals under *name*.
    readers = client.broadcast_view(is_coalescing = True)
    readers.apply_sync(load_memmap, name, path = memFilePath)
@ipp.interactive
def parallelTrainTestQuery(queryIndex):
    """Debug task: fetch the broadcast array from this engine's globals and report it.

    Assumes clusterMemDataName was pushed to the engines beforehand and the
    array was installed under that name by load_memmap.
    """
    import numpy as np
    qois = globals()[clusterMemDataName]
    # I don't know how to log from an engine so right now I am using an exception to see the data.
    # Use repr() instead of '+': concatenating str + ndarray is a numpy
    # broadcast add and raises UFuncTypeError instead of showing the value.
    raise Exception('qois is: ' + repr(qois))
# Make the lookup name available on every engine before any task runs.
dview.push(dict(
    clusterMemDataName = clusterMemDataName
), block = True)
# Broadcast the data: written once per host, memmapped by every engine.
# (qois is defined elsewhere in the notebook/script — not shown here.)
bcast_memmap(
    qois,
    clusterMemDataName,
    clusterMemFilePath,
    rc,
    host_engines
)
# Fan the queries out over the load-balanced view; nextQueryIndices is
# defined elsewhere. ordered=False yields results in completion order.
for i, result in enumerate(
    lview.imap(parallelTrainTestQuery, nextQueryIndices, ordered = False, max_outstanding = 'auto')
):
    print(result)
Strangely while the global is now available in the engine, the value seems corrupted. When I try to print it (through the exception) I get UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U34'), dtype('uint8')) -> None
When I try to access it directly it seems to be an uint8 where it should be an array of float64.
I had to make a few changes to get this to work:
@ipp.interactive
def load_memmap(name, path, mode = 'r+'):
    # np.load parses the .npy header that np.save wrote and memory-maps the
    # payload with the correct dtype/shape; np.memmap would map the raw bytes
    # (header included) as uint8. allow_pickle=True permits object payloads.
    import numpy as np
    globals()[name] = np.load(path, mmap_mode = mode, allow_pickle = True)
and finally I had to broadcast each array individually to preserve types:
# Broadcast each array in its own file so each keeps its own dtype/shape.
# (Original lines had a stray ')' after each path argument — a SyntaxError.)
bcast_memmap(qois, 'qois', f'{clusterMemFilePathPrefix}/qois.npy', rc, host_engines)
bcast_memmap(trainingSampleSeries, 'trainingSampleSeries', f'{clusterMemFilePathPrefix}/trainingSampleSeries.npy', rc, host_engines)
bcast_memmap(testingSampleSeries, 'testingSampleSeries', f'{clusterMemFilePathPrefix}/testingSampleSeries.npy', rc, host_engines)
bcast_memmap(sampleAdditions, 'sampleAdditions', f'{clusterMemFilePathPrefix}/sampleAdditions.npy', rc, host_engines)