ERROR - IndexError: list index out of range
meharc opened this issue · comments
Discussed in #2075
Originally posted by meharc March 19, 2024
I replicated an example code, identical to the one provided in this repository, within a Databricks notebook to explore Splink.
from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl
from splink.duckdb.blocking_rule_library import block_on
from splink.datasets import splink_datasets
df = splink_datasets.fake_1000
settings = {
"link_type": "dedupe_only",
"blocking_rules_to_generate_predictions": [
block_on("first_name"),
block_on("surname"),
],
"comparisons": [
ctl.name_comparison("first_name"),
ctl.name_comparison("surname"),
ctl.date_comparison("dob", cast_strings_to_date=True),
cl.exact_match("city", term_frequency_adjustments=True),
ctl.email_comparison("email", include_username_fuzzy_level=False),
],
}
linker = DuckDBLinker(df, settings)
linker.estimate_u_using_random_sampling(max_pairs=1e6)
blocking_rule_for_training = block_on(["first_name", "surname"])
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
blocking_rule_for_training = block_on("dob")
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
pairwise_predictions = linker.predict()
clusters = linker.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.95)
clusters.as_pandas_dataframe(limit=5)
This gives me the following error:
IndexError: list index out of range
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
File <command-1077583777744438>, line 29
25 linker.estimate_u_using_random_sampling(max_pairs=1e6)
27 blocking_rule_for_training = block_on(["first_name", "surname"])
---> 29 linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
31 blocking_rule_for_training = block_on("dob")
32 linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training, estimate_without_term_frequencies=True)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-56a9ba29-5210-4bf7-abfd-49580965d839/lib/python3.10/site-packages/splink/linker.py:1671, in Linker.estimate_parameters_using_expectation_maximisation(self, blocking_rule, comparisons_to_deactivate, comparison_levels_to_reverse_blocking_rule, estimate_without_term_frequencies, fix_probability_two_random_records_match, fix_m_probabilities, fix_u_probabilities, populate_probability_two_random_records_match_from_trained_values)
1659 if comparison_levels_to_reverse_blocking_rule is None:
1660 logger.warning(
1661 "\nWARNING: \n"
1662 "You have provided comparisons_to_deactivate but not "
(...)
1668 "as an exact match."
1669 )
-> 1671 em_training_session = EMTrainingSession(
1672 self,
1673 blocking_rule,
1674 fix_u_probabilities=fix_u_probabilities,
1675 fix_m_probabilities=fix_m_probabilities,
1676 fix_probability_two_random_records_match=fix_probability_two_random_records_match, # noqa 501
1677 comparisons_to_deactivate=comparisons_to_deactivate,
1678 comparison_levels_to_reverse_blocking_rule=comparison_levels_to_reverse_blocking_rule, # noqa 501
1679 estimate_without_term_frequencies=estimate_without_term_frequencies,
1680 )
1682 em_training_session._train()
1684 self._populate_m_u_from_trained_values()
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-56a9ba29-5210-4bf7-abfd-49580965d839/lib/python3.10/site-packages/splink/em_training_session.py:70, in EMTrainingSession.__init__(self, linker, blocking_rule_for_training, fix_u_probabilities, fix_m_probabilities, fix_probability_two_random_records_match, comparisons_to_deactivate, comparison_levels_to_reverse_blocking_rule, estimate_without_term_frequencies)
66 self._comparison_levels_to_reverse_blocking_rule = (
67 comparison_levels_to_reverse_blocking_rule
68 )
69 else:
---> 70 self._comparison_levels_to_reverse_blocking_rule = self._original_settings_obj._get_comparison_levels_corresponding_to_training_blocking_rule( # noqa
71 blocking_rule_for_training.blocking_rule_sql
72 )
74 self._settings_obj._probability_two_random_records_match = (
75 self._blocking_adjusted_probability_two_random_records_match
76 )
78 self._training_fix_u_probabilities = fix_u_probabilities
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-56a9ba29-5210-4bf7-abfd-49580965d839/lib/python3.10/site-packages/splink/settings.py:338, in Settings._get_comparison_levels_corresponding_to_training_blocking_rule(self, blocking_rule)
336 for cc in ccs:
337 for cl in cc.comparison_levels:
--> 338 if cl._is_exact_match:
339 exact_comparison_levels.append(cl)
341 # Where exact match on multiple columns exists, use that instead of individual
342 # exact match columns
343 # So for example, if we have a param estimate for exact match on first name AND
344 # surname, prefer that
345 # over individual estimtes for exact match first name and surname.
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-56a9ba29-5210-4bf7-abfd-49580965d839/lib/python3.10/site-packages/splink/comparison_level.py:503, in ComparisonLevel._is_exact_match(self)
501 exprs = _get_and_subclauses(sql_cnf)
502 for expr in exprs:
--> 503 if not _is_exact_match(expr):
504 return False
505 return True
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-56a9ba29-5210-4bf7-abfd-49580965d839/lib/python3.10/site-packages/splink/comparison_level.py:45, in _is_exact_match(sql_syntax_tree)
43 if type(subtree) is Identifier:
44 identifiers.append(subtree.this[:-2])
---> 45 if identifiers[0] == identifiers[1]:
46 return True
47 else:
IndexError: list index out of range
I've just copied and pasted the code from the tutorial located on the homepage of the repository. Do you have any suggestions or insights into why this might be occurring?
The cause is this:
#2072
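The latest version of SQLGlot, which was just released, doesn't work with Splink. The traceback is consistent with this: Splink's `_is_exact_match` walks the SQLGlot syntax tree, collects `Identifier` nodes, and then compares `identifiers[0]` with `identifiers[1]`, so it raises `IndexError` when fewer than two identifiers are found — presumably because the new SQLGlot release produces a differently shaped tree.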
For the moment, you need to make sure you have SQLGlot 22.x installed: install Splink first, then pin SQLGlot to a 22.x release, for example with `pip install sqlglot==22.5.0`.