I was unable to implement BERT trained on PubMed
dbarrau opened this issue · comments
All steps were carried out according to the documentation, and when attempted to train the model after compiling, I received this error:
Epoch 1/20
---------------------------------------------------------------------------
UnknownError Traceback (most recent call last)
[<ipython-input-98-f1f15d667bfd>](https://localhost:8080/#) in <module>
5 epochs=20,
6 callbacks=[model_checkpoint,
----> 7 early_stopping])
1 frames
[/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/execute.py](https://localhost:8080/#) in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
53 ctx.ensure_initialized()
54 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 55 inputs, attrs, num_outputs)
56 except core._NotOkStatusException as e:
57 if name is not None:
UnknownError: Graph execution error:
Detected at node 'cond/CudnnRNNV3' defined at (most recent call last):
File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 846, in launch_instance
app.start()
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelapp.py", line 612, in start
self.io_loop.start()
File "/usr/local/lib/python3.7/dist-packages/tornado/platform/asyncio.py", line 132, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
self._run_once()
File "/usr/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
handle._run()
File "/usr/lib/python3.7/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "/usr/local/lib/python3.7/dist-packages/tornado/ioloop.py", line 758, in _run_callback
ret = callback()
File "/usr/local/lib/python3.7/dist-packages/tornado/stack_context.py", line 300, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 1233, in inner
self.run()
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 1147, in run
yielded = self.gen.send(value)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 365, in process_one
yield gen.maybe_future(dispatch(*args))
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 545, in execute_request
user_expressions, allow_stdin,
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/ipkernel.py", line 306, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/zmqshell.py", line 536, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2855, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2881, in _run_cell
return runner(coro)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
coro.send(None)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3058, in run_cell_async
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3249, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-98-f1f15d667bfd>", line 7, in <module>
early_stopping])
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1409, in fit
tmp_logs = self.train_function(iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1051, in train_function
return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1040, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1030, in run_step
outputs = model.train_step(data)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 889, in train_step
y_pred = self(x, training=True)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 490, in __call__
return super().__call__(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/functional.py", line 459, in call
inputs, training=training, mask=mask)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/functional.py", line 596, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/rnn/bidirectional.py", line 249, in __call__
return super(Bidirectional, self).__call__(inputs, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/rnn/bidirectional.py", line 365, in call
initial_state=forward_state, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/rnn/base_rnn.py", line 515, in __call__
return super(RNN, self).__call__(inputs, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/rnn/lstm.py", line 673, in call
runtime) = lstm_with_backend_selection(**normal_lstm_kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/rnn/lstm.py", line 1184, in lstm_with_backend_selection
gru_lstm_utils.function_register(defun_gpu_lstm, **params)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/rnn/gru_lstm_utils.py", line 244, in function_register
concrete_func = func.get_concrete_function(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/rnn/lstm.py", line 1152, in gpu_lstm_with_fallback
false_fn=stardard_lstm_fn)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/rnn/lstm.py", line 1132, in cudnn_lstm_fn
return_sequences=return_sequences)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/rnn/lstm.py", line 1005, in gpu_lstm
time_major=time_major)
Node: 'cond/CudnnRNNV3'
Fail to find the dnn implementation.
[[{{node cond/CudnnRNNV3}}]]
[[tribrid_embedding_model/bidirectional_3/forward_lstm_3/PartitionedCall]] [Op:__inference_train_function_227513]
Something else I noted, after installing and importing tensorflow_text
, I was required to restart the runtime, and then models with bidirectional LSTM layers would not be able to train. In this next case, the traceback error is:
Saving TensorBoard log files to: SkimLit-logs/model_1_tok_emb_conv1d/20220902-121155
Epoch 1/3
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
[<ipython-input-48-9b6dc8cf5719>](https://localhost:8080/#) in <module>
6 validation_steps=int(0.1*len(valid_dataset)),
7 callbacks=[create_tensorboard_callback(SAVE_DIR,
----> 8 "model_1_tok_emb_conv1d")])
1 frames
[/usr/local/lib/python3.7/dist-packages/tensorflow/python/eager/execute.py](https://localhost:8080/#) in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
53 ctx.ensure_initialized()
54 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 55 inputs, attrs, num_outputs)
56 except core._NotOkStatusException as e:
57 if name is not None:
UnimplementedError: Graph execution error:
Detected at node 'model_1_dense/conv_1D_layer/Conv1D' defined at (most recent call last):
File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 846, in launch_instance
app.start()
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelapp.py", line 612, in start
self.io_loop.start()
File "/usr/local/lib/python3.7/dist-packages/tornado/platform/asyncio.py", line 132, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
self._run_once()
File "/usr/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
handle._run()
File "/usr/lib/python3.7/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "/usr/local/lib/python3.7/dist-packages/tornado/ioloop.py", line 758, in _run_callback
ret = callback()
File "/usr/local/lib/python3.7/dist-packages/tornado/stack_context.py", line 300, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 1233, in inner
self.run()
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 1147, in run
yielded = self.gen.send(value)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 381, in dispatch_queue
yield self.process_one()
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 346, in wrapper
runner = Runner(result, future, yielded)
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 1080, in __init__
self.run()
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 1147, in run
yielded = self.gen.send(value)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 365, in process_one
yield gen.maybe_future(dispatch(*args))
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/kernelbase.py", line 545, in execute_request
user_expressions, allow_stdin,
File "/usr/local/lib/python3.7/dist-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/ipkernel.py", line 306, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.7/dist-packages/ipykernel/zmqshell.py", line 536, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2855, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2881, in _run_cell
return runner(coro)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
coro.send(None)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3058, in run_cell_async
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3249, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-48-9b6dc8cf5719>", line 8, in <module>
"model_1_tok_emb_conv1d")])
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1409, in fit
tmp_logs = self.train_function(iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1051, in train_function
return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1040, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1030, in run_step
outputs = model.train_step(data)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 889, in train_step
y_pred = self(x, training=True)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 490, in __call__
return super().__call__(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/functional.py", line 459, in call
inputs, training=training, mask=mask)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/functional.py", line 596, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/convolutional/base_conv.py", line 250, in call
outputs = self.convolution_op(inputs, self.kernel)
File "/usr/local/lib/python3.7/dist-packages/keras/layers/convolutional/base_conv.py", line 232, in convolution_op
name=self.__class__.__name__)
Node: 'model_1_dense/conv_1D_layer/Conv1D'
DNN library is not found.
[[{{node model_1_dense/conv_1D_layer/Conv1D}}]] [Op:__inference_train_function_12429]
Anyone having these issues within Colab? Thanks
You can see the error accessing my colab, but it has lots there. It will take some time to run and reach to the latest part.
https://colab.research.google.com/drive/1wRbbEeLDmD4gNjlWvCmxYtDxc8lliaNw?usp=sharing
Hey, @dbarrau ,
I experienced this (apparently some version incompatibility) and I solved it with the following
# Check libcudnn8 version
!apt-cache policy libcudnn8
# Install latest version
!apt install --allow-change-held-packages libcudnn8=8.4.1.50-1+cuda11.6
# Export env variables
!export PATH=/usr/local/cuda-11.4/bin${PATH:+:${PATH}}
!export LD_LIBRARY_PATH=/usr/local/cuda-11.4/lib64:$LD_LIBRARY_PATH
!export LD_LIBRARY_PATH=/usr/local/cuda-11.4/include:$LD_LIBRARY_PATH
!export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64
# Install tensorflow
!pip install tflite-model-maker==0.4.0
# !pip uninstall -y tensorflow && pip install -q tensorflow==2.9.1
!pip install pycocotools==2.0.4
!pip install opencv-python-headless==4.6.0.66