test_replace_active_node fails with std::runtime_error (the topology coordinator rejected request to join the cluster: tried to replace alive node 354e86b3-2120-49b8-8b8e-317820be5a5a)'])
Deexie opened this issue · comments
request = <SubRequest 'fixture_dtest_setup' for <Function test_replace_active_node[use_endpoint-rbo_disabled]>>
dtest_config = <dtest_config.DTestConfig object at 0x7f37e3a63d90>
fixture_dtest_setup_overrides = <dtest_setup_overrides.DTestSetupOverrides object at 0x7f37889c19d0>
fixture_logging_setup = None, fixture_dtest_cluster_name = 'test'
fixture_dtest_create_cluster_func = <function DTestSetup.create_ccm_cluster at 0x7f37e835dee0>
@pytest.fixture(scope="function", autouse=False)
def fixture_dtest_setup( # noqa: PLR0912, PLR0913
request,
dtest_config,
fixture_dtest_setup_overrides,
fixture_logging_setup,
fixture_dtest_cluster_name,
fixture_dtest_create_cluster_func,
):
# do all of our setup operations to get the environment ready for the actual test
# to run (e.g. bring up a cluster with the necessary config, populate variables, etc)
initial_environment = copy.deepcopy(os.environ)
dtest_setup = DTestSetup(dtest_config=dtest_config, setup_overrides=fixture_dtest_setup_overrides, cluster_name=fixture_dtest_cluster_name)
cassandra_cluster = dtest_config.cassandra_version
if not cassandra_cluster:
if request.node.get_closest_marker("single_node") or not request.node.get_closest_marker("no_boot_speedups"):
dtest_setup.cluster_options.setdefault("skip_wait_for_gossip_to_settle", 0)
# Reduce waiting time for the nodes to hear from others before joining the ring.
# Since all test cases run on localhost and there are no large test clusters
# it's safe to reduce the value to save a lot of time while testing.
# (Default value for the option is 30s)
dtest_setup.cluster_options.setdefault("ring_delay_ms", 10000)
cluster_options = request.node.get_closest_marker("cluster_options")
if cluster_options:
for name, value in cluster_options.kwargs.items():
dtest_setup.cluster_options.setdefault(name, value)
manager_install_dir = dtest_setup.prepare_scylla_manager() if request.node.get_closest_marker("scylla_manager") else None
dtest_setup.initialize_cluster(fixture_dtest_create_cluster_func, manager_install_dir=manager_install_dir)
# at this point we're done with our setup operations in this fixture
# yield to allow the actual test to run
yield dtest_setup
# phew! we're back after executing the test, now we need to do
# all of our teardown and cleanup operations
reset_environment_vars(initial_environment)
dtest_setup.jvm_args = []
for con in dtest_setup.connections:
con.cluster.shutdown()
dtest_setup.connections = []
rep_setup = getattr(request.node, "rep_setup", None)
rep_call = getattr(request.node, "rep_call", None)
failed = getattr(rep_setup, "failed", False) or getattr(rep_call, "failed", False)
try:
dtest_setup.cluster.stop(gently=True)
except Exception as e: # noqa: BLE001
logger.error("Error stopping cluster: %s", str(e))
try:
if not dtest_setup.allow_log_errors:
exclude_errors = []
marker = request.node.get_closest_marker("exclude_errors")
if marker:
exclude_errors = list(marker.args)
try:
> dtest_setup.check_errors_all_nodes(exclude_errors=exclude_errors)
conftest.py:322:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <dtest_setup.DTestSetup object at 0x7f37537cdf90>
nodes = [<ccmlib.scylla_node.ScyllaNode object at 0x7f37e0421390>, <ccmlib.scylla_node.ScyllaNode object at 0x7f37e0421e50>, <ccmlib.scylla_node.ScyllaNode object at 0x7f37e049c8d0>, <ccmlib.scylla_node.ScyllaNode object at 0x7f37e025e350>]
exclude_errors = [], search_str = None, regex = False
def check_errors_all_nodes(self, nodes=None, exclude_errors=None, search_str=None, regex=False):
if nodes is None:
nodes = self.cluster.nodelist()
critical_errors = []
found_errors = []
for node in nodes:
try:
critical_errors_pattern = r"Assertion.*failed|AddressSanitizer"
if self.ignore_cores_log_patterns:
expr = "|".join([f"({p})" for p in set(self.ignore_cores_log_patterns)])
matches = node.grep_log(expr)
if matches:
logger.debug(f"Will ignore cores on {node.name}. Found the following log messages: {matches}")
self.ignore_cores.append(node)
if node not in self.ignore_cores:
critical_errors_pattern += "|Aborting"
matches = node.grep_log(critical_errors_pattern)
if matches:
critical_errors.append((node.name, [m[0].strip() for m in matches]))
except FileNotFoundError:
pass
logger.debug(f"exclude_errors: {exclude_errors}")
errors = self.check_errors(node=node, exclude_errors=exclude_errors, search_str=search_str, regex=regex, return_errors=True)
if len(errors):
found_errors.append((node.name, errors))
if critical_errors:
raise AssertionError(f"Critical errors found: {critical_errors}\nOther errors: {found_errors}")
if found_errors:
> raise AssertionError(f"Unexpected errors found: {found_errors}")
E AssertionError: Unexpected errors found: [('node4', ['ERROR 2024-05-08 23:07:49,545 [shard 0:main] init - Startup failed: std::runtime_error (the topology coordinator rejected request to join the cluster: tried to replace alive node 354e86b3-2120-49b8-8b8e-317820be5a5a)'])]
dtest_setup.py:636: AssertionError
@gleb-cloudius - are you familiar with it?
In the test_replace_active_node
test, such an error is expected and is supposed to be filtered out by the test if consistent-topology-changes
is enabled. Probably a fallout from making consistent topology default @patjed41.
This test is fine. It failed in gating because we had a problem in CI for a moment, which also impacted other PRs, for example #18569 (comment). The test ran with the raft-based topology, but it didn't know about it, so this part of the test registered the wrong message in the ignore list:
if "consistent-topology-changes" in self.scylla_features:
expected_message = f"tried to replace alive node {node3_hostid}"
else:
expected_message = "Cannot replace a live node"
self.ignore_log_patterns += [expected_message]
We can close this issue.
Other failures in https://jenkins.scylladb.com/job/scylla-master/job/gating-dtest-release-with-consistent-topology-changes/1459/ were caused by the same problem.