File could not be opened for reading with mpi and FileStorage
yohad opened this issue · comments
Hello, I have a simulation that I run with `mpiexec -n 16 python py_pde_test.py`.
Roughly every other time I run it, I get the error `File data\storage-1108_191954\ could not be opened for reading`
from some of the MPI processes.
The code:
import numpy as np
import numba as nb
import pde
from pde.tools import mpi
from pde.solvers import Controller, ExplicitMPISolver
import time
class ModelPDE(pde.PDEBase):
    """Coupled PDE for a three-component state ``[b, w, h]``.

    The right-hand side is implemented twice with the same formulas: once for
    the numpy backend (:meth:`evolution_rate`) and once as a numba-compiled
    function (:meth:`_make_pde_rhs_numba`).

    Args:
        bc: Boundary conditions handed to every differential operator.
        nu, eta, rho, gamma: Coefficients of the local reaction terms.
        delta_b, delta_w, delta_h: Coefficients multiplying the spatial
            derivative terms of ``b``, ``w`` and ``h`` respectively.
        a, q, f: Parameters of the infiltration rate
            ``a * (b + q * f) / (b + q)``.
        p: Constant source term in the equation for ``h``.
    """

    def __init__(self, bc="auto_periodic_neumann", nu=10 / 3, eta=3.5, rho=0.95,
                 gamma=50 / 3, delta_b=1 / 30, delta_w=10 / 3, delta_h=1000 / 3,
                 a=33.33, q=0.05, f=0.1, p=0.5):
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self._nu = nu
        self._eta = eta
        self._rho = rho
        self._gamma = gamma
        self._delta_b = delta_b
        self._delta_w = delta_w
        self._delta_h = delta_h
        self._a = a
        self._q = q
        self._f = f
        self._p = p
        self._bc = bc

    @staticmethod
    def _zeta(shape):
        """Return the offset array added to ``h`` before taking its gradient.

        The value at each cell is its *index* along the first axis divided
        by 32.
        """
        # TODO: this is bad
        # NOTE(review): built from array indices, not from grid coordinates,
        # so it presumably does not follow the physical cell positions (or a
        # sub-grid offset when the grid is decomposed) -- confirm the intent.
        return np.fromfunction(lambda x, _: x / 32, shape)

    def _make_pde_rhs_numba(self, state):
        """Build a numba-compiled function evaluating the right-hand side.

        Args:
            state: Field collection whose grid supplies the operators and the
                array shape.

        Returns:
            A function ``pde_rhs(state_data, t)`` returning an array shaped
            like ``state_data`` that holds the time derivatives.
        """
        # Copy the parameters into locals so the compiled closure captures
        # plain values rather than attributes of ``self``.
        a = self._a
        q = self._q
        f = self._f
        eta = self._eta
        nu = self._nu
        rho = self._rho
        gamma = self._gamma
        delta_b = self._delta_b
        delta_w = self._delta_w
        delta_h = self._delta_h
        p = self._p

        laplace = state.grid.make_operator("laplace", bc=self._bc)
        div = state.grid.make_operator("divergence", bc=self._bc)
        grad = state.grid.make_operator("gradient", bc=self._bc)
        zeta = self._zeta(state.grid.shape)

        @nb.jit
        def pde_rhs(state_data, t):
            # state
            b = state_data[0]
            w = state_data[1]
            h = state_data[2]
            rate = np.empty_like(state_data)

            # Calculate constants
            infiltration = a * (b + q * f) / (b + q)  # Infiltration Rate
            l2 = np.float_power(1 + eta * b, 2)
            g_b = nu * w * l2
            g_w = gamma * b * l2

            # Calculate time derivatives
            rate[0] = g_b * b * (1 - b) - b + delta_b * laplace(b)
            rate[1] = (
                infiltration * h
                - nu * (1 - rho * b) * w
                - g_w * w
                + delta_w * laplace(w)
            )
            flux = -2 * delta_h * h * grad(h + zeta)
            rate[2] = p - infiltration * h - div(flux)
            return rate

        return pde_rhs

    def evolution_rate(self, state: pde.FieldBase, t: float = 0) -> pde.FieldBase:
        """Evaluate the right-hand side with the numpy backend.

        Args:
            state: Field collection ``[b, w, h]``.
            t: Current time; the equations do not depend on it.

        Returns:
            Field collection with the time derivatives of ``b``, ``w``, ``h``.
        """
        b = state[0]
        w = state[1]
        h = state[2]
        zeta = self._zeta(state.grid.shape)

        # Calculate constants
        infiltration = self._a * (b + self._q * self._f) / (b + self._q)  # Infiltration Rate
        l2 = np.float_power(1 + self._eta * b, 2)
        g_b = self._nu * w * l2
        g_w = self._gamma * b * l2

        # Calculate time derivatives
        b_t = g_b * b * (1 - b) - b + self._delta_b * b.laplace(bc=self._bc)
        w_t = (
            infiltration * h
            - self._nu * (1 - self._rho * b) * w
            - g_w * w
            + self._delta_w * w.laplace(bc=self._bc)
        )
        flux = -2 * self._delta_h * h * (h + zeta).gradient(bc=self._bc)
        h_t = self._p - infiltration * h - flux.divergence(bc=self._bc)
        return pde.FieldCollection([b_t, w_t, h_t])
def terrain(coords):
    """Return the first coordinate component of each cell divided by 32.

    Args:
        coords: Three-dimensional array; the last axis indexes the coordinate
            component.

    Returns:
        Two-dimensional array ``coords[:, :, 0] / 32``.
    """
    first_component = coords[:, :, 0]
    return first_component / 32
def main():
    """Integrate the model with the MPI solver, store it and render a movie.

    The trajectory is written to ``data/storage-<timestamp>`` and, on the main
    MPI process only, a movie is rendered to
    ``results/movie-<timestamp>.mp4``.
    """
    from pathlib import Path  # local import: only this entry point needs it

    shape = (128, 128)
    grid = pde.CartesianGrid([(0, 32), (0, 32)], shape, periodic=[True, False])

    b = pde.ScalarField(grid, 1)
    w = pde.ScalarField(grid, 0)
    h = pde.ScalarField(grid, 0)
    state = pde.FieldCollection([b, w, h])

    # Periodic along x; along y a fixed value on one side and zero curvature
    # on the other.
    bc_x = "periodic"
    bc_y = [{"value": 0}, {"curvature": 0}]
    bc = [bc_x, bc_y]

    years = 2

    # NOTE(review): every MPI process evaluates the clock on its own, so the
    # timestamp (and hence the file name) could differ between processes if
    # they start in different seconds -- confirm whether this matters.
    timestamp = time.strftime("%m%d_%H%M%S", time.localtime())

    # Build the paths with pathlib instead of "data\storage-" style literals:
    # "\s" and "\m" are invalid escape sequences and the backslash only works
    # as a separator on Windows.
    backup_path = Path("data") / f"storage-{timestamp}"
    video_path = Path("results") / f"movie-{timestamp}.mp4"

    if mpi.is_main:
        # Only the main process touches the file system for setup.
        backup_path.parent.mkdir(parents=True, exist_ok=True)
        video_path.parent.mkdir(parents=True, exist_ok=True)

    storage = pde.FileStorage(str(backup_path), write_mode="append")

    eq = ModelPDE(p=0.4, bc=bc)
    solver = ExplicitMPISolver(eq, backend="numba")
    controller = Controller(
        solver, t_range=years, tracker=["progress", storage.tracker(1e-1)]
    )
    controller.run(state, dt=1e-4)

    if mpi.is_main:
        pde.movie(
            storage, filename=str(video_path), plot_args={"vmin": 0, "vmax": 1}
        )
# Script entry point; launch with e.g. ``mpiexec -n 16 python py_pde_test.py``.
if __name__ == "__main__":
    main()
Thank you for the report! I can't quite reproduce the problem, but given that you also don't see the problem all the time, this might not be surprising. In any case, I did not carefully check how the `FileStorage` interacts with the `ExplicitMPISolver`, so it is entirely possible that there are race conditions or other IO-related problems.
To circumvent this, I just implemented a check that simply does not touch files unless we're on the main process. Could you please check whether the problem still occurs with the latest `master` branch?
This seems to have solved the problem, thanks 😄