Notebook test sometimes fails
ADBond opened this issue · comments
Example notebook sometimes fails in sqlite, see this run. Something random, as it passed on re-run.
I have a feeling this has happened before also. Not massively urgent, as it is not happening frequently and we can re-run the job, but it would be nice to get to the bottom of why it is happening.
For reference:
Click for full output
============================= test session starts ==============================
platform linux -- Python 3.9.19, pytest-8.0.0, pluggy-1.5.0 -- /home/runner/work/splink/splink/.venv/bin/python
cachedir: .pytest_cache
rootdir: /home/runner/work/splink/splink
configfile: pyproject.toml
plugins: anyio-4.3.0, nbmake-1.5.0, xdist-3.5.0
created: 2/2 workers
2 workers [1 item]
scheduling tests via LoadScheduling
docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
[gw0] [100%] FAILED docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
=================================== FAILURES ===================================
_ /home/runner/work/splink/splink/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb _
[gw0] linux -- Python 3.9.19 /home/runner/work/splink/splink/.venv/bin/python
---------------------------------------------------------------------------
df_predict = linker.predict()
df_e = df_predict.as_pandas_dataframe(limit=5)
df_e
---------------------------------------------------------------------------
OperationalError Traceback (most recent call last)
File ~/work/splink/splink/splink/database_api.py:64, in DatabaseAPI._log_and_run_sql_execution(self, final_sql, templated_name, physical_name)
63 try:
---> 64 return self._execute_sql_against_backend(final_sql)
65 except Exception as e:
66 # Parse our SQL through sqlglot to pretty print
File ~/work/splink/splink/splink/sqlite/database_api.py:103, in SQLiteAPI._execute_sql_against_backend(self, final_sql)
102 def _execute_sql_against_backend(self, final_sql: str) -> sqlite3.Cursor:
--> 103 return self.con.execute(final_sql)
OperationalError: user-defined function raised exception
The above exception was the direct cause of the following exception:
SplinkException Traceback (most recent call last)
Cell In[12], line 1
----> 1 df_predict = linker.predict()
2 df_e = df_predict.as_pandas_dataframe(limit=5)
3 df_e
File ~/work/splink/splink/splink/linker.py:1143, in Linker.predict(self, threshold_match_probability, threshold_match_weight, materialise_after_computing_term_frequencies)
1135 sqls = predict_from_comparison_vectors_sqls_using_settings(
1136 self._settings_obj,
1137 threshold_match_probability,
1138 threshold_match_weight,
1139 sql_infinity_expression=self._infinity_expression,
1140 )
1141 pipeline.enqueue_list_of_sqls(sqls)
-> 1143 predictions = self.db_api.sql_pipeline_to_splink_dataframe(pipeline)
1144 self._predict_warning()
1146 [b.drop_materialised_id_pairs_dataframe() for b in exploding_br_with_id_tables]
File ~/work/splink/splink/splink/database_api.py:203, in DatabaseAPI.sql_pipeline_to_splink_dataframe(self, pipeline, use_cache)
200 sql_gen = pipeline.generate_cte_pipeline_sql()
201 output_tablename_templated = pipeline.output_table_name
--> 203 splink_dataframe = self.sql_to_splink_dataframe_checking_cache(
204 sql_gen,
205 output_tablename_templated,
206 use_cache,
207 )
208 else:
209 # In debug mode, we do not pipeline the sql and print the
210 # results of each part of the pipeline
211 for cte in pipeline.ctes_pipeline():
File ~/work/splink/splink/splink/database_api.py:174, in DatabaseAPI.sql_to_splink_dataframe_checking_cache(self, sql, output_tablename_templated, use_cache)
171 print(df_pd) # noqa: T201
173 else:
--> 174 splink_dataframe = self._sql_to_splink_dataframe(
175 sql, output_tablename_templated, table_name_hash
176 )
178 splink_dataframe.created_by_splink = True
179 splink_dataframe.sql_used_to_create = sql
File ~/work/splink/splink/splink/database_api.py:95, in DatabaseAPI._sql_to_splink_dataframe(self, sql, templated_name, physical_name)
87 """
88 Create a table in the backend using some given sql
89
(...)
92 Returns a SplinkDataFrame which also uses templated_name
93 """
94 sql = self._setup_for_execute_sql(sql, physical_name)
---> 95 spark_df = self._log_and_run_sql_execution(sql, templated_name, physical_name)
96 output_df = self._cleanup_for_execute_sql(
97 spark_df, templated_name, physical_name
98 )
99 self._intermediate_table_cache.executed_queries.append(output_df)
File ~/work/splink/splink/splink/database_api.py:76, in DatabaseAPI._log_and_run_sql_execution(self, final_sql, templated_name, physical_name)
73 except Exception:
74 pass
---> 76 raise SplinkException(
77 f"Error executing the following sql for table "
78 f"`{templated_name}`({physical_name}):\n{final_sql}"
79 f"\n\nError was: {e}"
80 ) from e
SplinkException: Error executing the following sql for table `__splink__df_predict`(__splink__df_predict_2a9a4ba75):
CREATE TABLE __splink__df_predict_2a9a4ba75 AS
WITH
__splink__df_concat_with_tf as (
select * from __splink__df_concat_with_tf_b6af2d7ee),
__splink__df_blocked as (
select
"l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."first_name" AS "first_name_l", "r"."first_name" AS "first_name_r", "l"."tf_first_name" AS "tf_first_name_l", "r"."tf_first_name" AS "tf_first_name_r", "l"."surname" AS "surname_l", "r"."surname" AS "surname_r", "l"."tf_surname" AS "tf_surname_l", "r"."tf_surname" AS "tf_surname_r", "l"."dob" AS "dob_l", "r"."dob" AS "dob_r", "l"."tf_dob" AS "tf_dob_l", "r"."tf_dob" AS "tf_dob_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."tf_birth_place" AS "tf_birth_place_l", "r"."tf_birth_place" AS "tf_birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r", "l"."tf_occupation" AS "tf_occupation_l", "r"."tf_occupation" AS "tf_occupation_r"
, '0' as match_key
from __splink__df_concat_with_tf as l
inner join __splink__df_concat_with_tf as r
on
(l.first_name = r.first_name and l.surname = r.surname)
where l."unique_id" < r."unique_id"
UNION ALL
select
"l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."first_name" AS "first_name_l", "r"."first_name" AS "first_name_r", "l"."tf_first_name" AS "tf_first_name_l", "r"."tf_first_name" AS "tf_first_name_r", "l"."surname" AS "surname_l", "r"."surname" AS "surname_r", "l"."tf_surname" AS "tf_surname_l", "r"."tf_surname" AS "tf_surname_r", "l"."dob" AS "dob_l", "r"."dob" AS "dob_r", "l"."tf_dob" AS "tf_dob_l", "r"."tf_dob" AS "tf_dob_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."tf_birth_place" AS "tf_birth_place_l", "r"."tf_birth_place" AS "tf_birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r", "l"."tf_occupation" AS "tf_occupation_l", "r"."tf_occupation" AS "tf_occupation_r"
, '1' as match_key
from __splink__df_concat_with_tf as l
inner join __splink__df_concat_with_tf as r
on
(l.surname = r.surname and l.dob = r.dob)
where l."unique_id" < r."unique_id"
AND NOT (coalesce((l.first_name = r.first_name and l.surname = r.surname),false))
UNION ALL
select
"l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."first_name" AS "first_name_l", "r"."first_name" AS "first_name_r", "l"."tf_first_name" AS "tf_first_name_l", "r"."tf_first_name" AS "tf_first_name_r", "l"."surname" AS "surname_l", "r"."surname" AS "surname_r", "l"."tf_surname" AS "tf_surname_l", "r"."tf_surname" AS "tf_surname_r", "l"."dob" AS "dob_l", "r"."dob" AS "dob_r", "l"."tf_dob" AS "tf_dob_l", "r"."tf_dob" AS "tf_dob_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."tf_birth_place" AS "tf_birth_place_l", "r"."tf_birth_place" AS "tf_birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r", "l"."tf_occupation" AS "tf_occupation_l", "r"."tf_occupation" AS "tf_occupation_r"
, '2' as match_key
from __splink__df_concat_with_tf as l
inner join __splink__df_concat_with_tf as r
on
(l.first_name = r.first_name and l.dob = r.dob)
where l."unique_id" < r."unique_id"
AND NOT (coalesce((l.first_name = r.first_name and l.surname = r.surname),false) OR coalesce((l.surname = r.surname and l.dob = r.dob),false))
UNION ALL
select
"l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."first_name" AS "first_name_l", "r"."first_name" AS "first_name_r", "l"."tf_first_name" AS "tf_first_name_l", "r"."tf_first_name" AS "tf_first_name_r", "l"."surname" AS "surname_l", "r"."surname" AS "surname_r", "l"."tf_surname" AS "tf_surname_l", "r"."tf_surname" AS "tf_surname_r", "l"."dob" AS "dob_l", "r"."dob" AS "dob_r", "l"."tf_dob" AS "tf_dob_l", "r"."tf_dob" AS "tf_dob_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."tf_birth_place" AS "tf_birth_place_l", "r"."tf_birth_place" AS "tf_birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r", "l"."tf_occupation" AS "tf_occupation_l", "r"."tf_occupation" AS "tf_occupation_r"
, '3' as match_key
from __splink__df_concat_with_tf as l
inner join __splink__df_concat_with_tf as r
on
(l.postcode_fake = r.postcode_fake and l.first_name = r.first_name)
where l."unique_id" < r."unique_id"
AND NOT (coalesce((l.first_name = r.first_name and l.surname = r.surname),false) OR coalesce((l.surname = r.surname and l.dob = r.dob),false) OR coalesce((l.first_name = r.first_name and l.dob = r.dob),false))
),
__splink__df_comparison_vectors as (
select "unique_id_l","unique_id_r","first_name_l","first_name_r",CASE WHEN "first_name_l" IS NULL OR "first_name_r" IS NULL THEN -1 WHEN "first_name_l" = "first_name_r" THEN 2 WHEN jaro_winkler("first_name_l", "first_name_r") >= 0.9 THEN 1 ELSE 0 END as gamma_first_name,"tf_first_name_l","tf_first_name_r","surname_l","surname_r",CASE WHEN "surname_l" IS NULL OR "surname_r" IS NULL THEN -1 WHEN "surname_l" = "surname_r" THEN 2 WHEN jaro_winkler("surname_l", "surname_r") >= 0.9 THEN 1 ELSE 0 END as gamma_surname,"tf_surname_l","tf_surname_r","dob_l","dob_r",CASE WHEN "dob_l" IS NULL OR "dob_r" IS NULL THEN -1 WHEN "dob_l" = "dob_r" THEN 3 WHEN damerau_levenshtein("dob_l", "dob_r") <= 1 THEN 2 WHEN damerau_levenshtein("dob_l", "dob_r") <= 2 THEN 1 ELSE 0 END as gamma_dob,"tf_dob_l","tf_dob_r","postcode_fake_l","postcode_fake_r",CASE WHEN "postcode_fake_l" IS NULL OR "postcode_fake_r" IS NULL THEN -1 WHEN "postcode_fake_l" = "postcode_fake_r" THEN 3 WHEN damerau_levenshtein("postcode_fake_l", "postcode_fake_r") <= 1 THEN 2 WHEN damerau_levenshtein("postcode_fake_l", "postcode_fake_r") <= 2 THEN 1 ELSE 0 END as gamma_postcode_fake,"birth_place_l","birth_place_r",CASE WHEN "birth_place_l" IS NULL OR "birth_place_r" IS NULL THEN -1 WHEN "birth_place_l" = "birth_place_r" THEN 1 ELSE 0 END as gamma_birth_place,"tf_birth_place_l","tf_birth_place_r","occupation_l","occupation_r",CASE WHEN "occupation_l" IS NULL OR "occupation_r" IS NULL THEN -1 WHEN "occupation_l" = "occupation_r" THEN 1 ELSE 0 END as gamma_occupation,"tf_occupation_l","tf_occupation_r",match_key
from __splink__df_blocked
),
__splink__df_match_weight_parts as (
select "unique_id_l","unique_id_r","first_name_l","first_name_r",gamma_first_name,"tf_first_name_l","tf_first_name_r",CASE
WHEN
gamma_first_name = -1
THEN cast(1.0 as float8)
WHEN
gamma_first_name = 2
THEN cast(22.538194655324588 as float8)
WHEN
gamma_first_name = 1
THEN cast(8.398460821344122e-10 as float8)
WHEN
gamma_first_name = 0
THEN cast(0.9595244959440041 as float8)
END as bf_first_name ,CASE WHEN gamma_first_name = -1 then cast(1 as float8) WHEN gamma_first_name = 2 then
(CASE WHEN coalesce("tf_first_name_l", "tf_first_name_r") is not null
THEN
POW(
cast(0.015050125313283208 as float8) /
(CASE
WHEN coalesce("tf_first_name_l", "tf_first_name_r") >= coalesce("tf_first_name_r", "tf_first_name_l")
THEN coalesce("tf_first_name_l", "tf_first_name_r")
ELSE coalesce("tf_first_name_r", "tf_first_name_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_first_name = 1 then cast(1 as float8) WHEN gamma_first_name = 0 then cast(1 as float8) END as bf_tf_adj_first_name ,"surname_l","surname_r",gamma_surname,"tf_surname_l","tf_surname_r",CASE
WHEN
gamma_surname = -1
THEN cast(1.0 as float8)
WHEN
gamma_surname = 2
THEN cast(829.7339196865737 as float8)
WHEN
gamma_surname = 1
THEN cast(2.347650066419875e-274 as float8)
WHEN
gamma_surname = 0
THEN cast(0.5640101767895112 as float8)
END as bf_surname ,CASE WHEN gamma_surname = -1 then cast(1 as float8) WHEN gamma_surname = 2 then
(CASE WHEN coalesce("tf_surname_l", "tf_surname_r") is not null
THEN
POW(
cast(0.0006437607605359682 as float8) /
(CASE
WHEN coalesce("tf_surname_l", "tf_surname_r") >= coalesce("tf_surname_r", "tf_surname_l")
THEN coalesce("tf_surname_l", "tf_surname_r")
ELSE coalesce("tf_surname_r", "tf_surname_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_surname = 1 then cast(1 as float8) WHEN gamma_surname = 0 then cast(1 as float8) END as bf_tf_adj_surname ,"dob_l","dob_r",gamma_dob,"tf_dob_l","tf_dob_r",CASE
WHEN
gamma_dob = -1
THEN cast(1.0 as float8)
WHEN
gamma_dob = 3
THEN cast(599.0777027027026 as float8)
WHEN
gamma_dob = 2
THEN cast(49.328752642706135 as float8)
WHEN
gamma_dob = 1
THEN cast(0.2054278922345485 as float8)
WHEN
gamma_dob = 0
THEN cast(1.1247139075244796e-34 as float8)
END as bf_dob ,CASE WHEN gamma_dob = -1 then cast(1 as float8) WHEN gamma_dob = 3 then
(CASE WHEN coalesce("tf_dob_l", "tf_dob_r") is not null
THEN
POW(
cast(0.001585770920390014 as float8) /
(CASE
WHEN coalesce("tf_dob_l", "tf_dob_r") >= coalesce("tf_dob_r", "tf_dob_l")
THEN coalesce("tf_dob_l", "tf_dob_r")
ELSE coalesce("tf_dob_r", "tf_dob_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_dob = 2 then cast(1 as float8) WHEN gamma_dob = 1 then cast(1 as float8) WHEN gamma_dob = 0 then cast(1 as float8) END as bf_tf_adj_dob ,"postcode_fake_l","postcode_fake_r",gamma_postcode_fake,CASE
WHEN
gamma_postcode_fake = -1
THEN cast(1.0 as float8)
WHEN
gamma_postcode_fake = 3
THEN cast(6939.673139652026 as float8)
WHEN
gamma_postcode_fake = 2
THEN cast(23793.00028541576 as float8)
WHEN
gamma_postcode_fake = 1
THEN cast(689.6354172768708 as float8)
WHEN
gamma_postcode_fake = 0
THEN cast(1.109823362613782e-64 as float8)
END as bf_postcode_fake ,"birth_place_l","birth_place_r",gamma_birth_place,"tf_birth_place_l","tf_birth_place_r",CASE
WHEN
gamma_birth_place = -1
THEN cast(1.0 as float8)
WHEN
gamma_birth_place = 1
THEN cast(219.43014705882354 as float8)
WHEN
gamma_birth_place = 0
THEN cast(6.648202339322792e-35 as float8)
END as bf_birth_place ,CASE WHEN gamma_birth_place = -1 then cast(1 as float8) WHEN gamma_birth_place = 1 then
(CASE WHEN coalesce("tf_birth_place_l", "tf_birth_place_r") is not null
THEN
POW(
cast(0.0045572589427829435 as float8) /
(CASE
WHEN coalesce("tf_birth_place_l", "tf_birth_place_r") >= coalesce("tf_birth_place_r", "tf_birth_place_l")
THEN coalesce("tf_birth_place_l", "tf_birth_place_r")
ELSE coalesce("tf_birth_place_r", "tf_birth_place_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_birth_place = 0 then cast(1 as float8) END as bf_tf_adj_birth_place ,"occupation_l","occupation_r",gamma_occupation,"tf_occupation_l","tf_occupation_r",CASE
WHEN
gamma_occupation = -1
THEN cast(1.0 as float8)
WHEN
gamma_occupation = 1
THEN cast(23.141810460746804 as float8)
WHEN
gamma_occupation = 0
THEN cast(0.1645048944611773 as float8)
END as bf_occupation ,CASE WHEN gamma_occupation = -1 then cast(1 as float8) WHEN gamma_occupation = 1 then
(CASE WHEN coalesce("tf_occupation_l", "tf_occupation_r") is not null
THEN
POW(
cast(0.036361752822956596 as float8) /
(CASE
WHEN coalesce("tf_occupation_l", "tf_occupation_r") >= coalesce("tf_occupation_r", "tf_occupation_l")
THEN coalesce("tf_occupation_l", "tf_occupation_r")
ELSE coalesce("tf_occupation_r", "tf_occupation_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_occupation = 0 then cast(1 as float8) END as bf_tf_adj_occupation ,match_key
from __splink__df_comparison_vectors
)
select
log2(cast(8.354916868577158e-05 as float8) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_tf_adj_surname * bf_dob * bf_tf_adj_dob * bf_postcode_fake * bf_birth_place * bf_tf_adj_birth_place * bf_occupation * bf_tf_adj_occupation) as match_weight,
CASE WHEN bf_first_name = 'infinity' OR bf_tf_adj_first_name = 'infinity' OR bf_surname = 'infinity' OR bf_tf_adj_surname = 'infinity' OR bf_dob = 'infinity' OR bf_tf_adj_dob = 'infinity' OR bf_postcode_fake = 'infinity' OR bf_birth_place = 'infinity' OR bf_tf_adj_birth_place = 'infinity' OR bf_occupation = 'infinity' OR bf_tf_adj_occupation = 'infinity' THEN 1.0 ELSE (cast(8.354916868577158e-05 as float8) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_tf_adj_surname * bf_dob * bf_tf_adj_dob * bf_postcode_fake * bf_birth_place * bf_tf_adj_birth_place * bf_occupation * bf_tf_adj_occupation)/(1+(cast(8.354916868577158e-05 as float8) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_tf_adj_surname * bf_dob * bf_tf_adj_dob * bf_postcode_fake * bf_birth_place * bf_tf_adj_birth_place * bf_occupation * bf_tf_adj_occupation)) END as match_probability,
"unique_id_l","unique_id_r","first_name_l","first_name_r",gamma_first_name,"tf_first_name_l","tf_first_name_r",bf_first_name,bf_tf_adj_first_name,"surname_l","surname_r",gamma_surname,"tf_surname_l","tf_surname_r",bf_surname,bf_tf_adj_surname,"dob_l","dob_r",gamma_dob,"tf_dob_l","tf_dob_r",bf_dob,bf_tf_adj_dob,"postcode_fake_l","postcode_fake_r",gamma_postcode_fake,bf_postcode_fake,"birth_place_l","birth_place_r",gamma_birth_place,"tf_birth_place_l","tf_birth_place_r",bf_birth_place,bf_tf_adj_birth_place,"occupation_l","occupation_r",gamma_occupation,"tf_occupation_l","tf_occupation_r",bf_occupation,bf_tf_adj_occupation,match_key
from __splink__df_match_weight_parts
Error was: user-defined function raised exception
============================== slowest durations ===============================
7.79s call docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
0.00s setup docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
0.00s teardown docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
=========================== short test summary info ============================
FAILED docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
============================== 1 failed in 8.36s ===============================
Error: Process completed with exit code 1.
See also: this run
Run logs
============================= test session starts ==============================
platform linux -- Python 3.9.19, pytest-8.0.0, pluggy-1.5.0 -- /home/runner/work/splink/splink/.venv/bin/python
cachedir: .pytest_cache
rootdir: /home/runner/work/splink/splink
configfile: pyproject.toml
plugins: anyio-4.3.0, nbmake-1.5.0, xdist-3.5.0
created: 2/2 workers
2 workers [1 item]
scheduling tests via LoadScheduling
docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
[gw0] [100%] FAILED docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
=================================== FAILURES ===================================
_ /home/runner/work/splink/splink/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb _
[gw0] linux -- Python 3.9.19 /home/runner/work/splink/splink/.venv/bin/python
---------------------------------------------------------------------------
training_blocking_rule = "l.dob = r.dob"
training_session_dob = linker.estimate_parameters_using_expectation_maximisation(
training_blocking_rule, estimate_without_term_frequencies=True
)
---------------------------------------------------------------------------
OperationalError Traceback (most recent call last)
File ~/work/splink/splink/splink/database_api.py:64, in DatabaseAPI._log_and_run_sql_execution(self, final_sql, templated_name, physical_name)
63 try:
---> 64 return self._execute_sql_against_backend(final_sql)
65 except Exception as e:
66 # Parse our SQL through sqlglot to pretty print
File ~/work/splink/splink/splink/sqlite/database_api.py:103, in SQLiteAPI._execute_sql_against_backend(self, final_sql)
102 def _execute_sql_against_backend(self, final_sql: str) -> sqlite3.Cursor:
--> 103 return self.con.execute(final_sql)
OperationalError: no such column: nan
The above exception was the direct cause of the following exception:
SplinkException Traceback (most recent call last)
Cell In[9], line 2
1 training_blocking_rule = "l.dob = r.dob"
----> 2 training_session_dob = linker.estimate_parameters_using_expectation_maximisation(
3 training_blocking_rule, estimate_without_term_frequencies=True
4 )
File ~/work/splink/splink/splink/linker.py:1027, in Linker.estimate_parameters_using_expectation_maximisation(self, blocking_rule, comparisons_to_deactivate, comparison_levels_to_reverse_blocking_rule, estimate_without_term_frequencies, fix_probability_two_random_records_match, fix_m_probabilities, fix_u_probabilities, populate_probability_two_random_records_match_from_trained_values)
1001 logger.warning(
1002 "\nWARNING: \n"
1003 "You have provided comparisons_to_deactivate but not "
(...)
1009 "as an exact match."
1010 )
1012 em_training_session = EMTrainingSession(
1013 self,
1014 db_api=self.db_api,
(...)
1024 estimate_without_term_frequencies=estimate_without_term_frequencies,
1025 )
-> 1027 core_model_settings = em_training_session._train()
1028 # overwrite with the newly trained values in our linker settings
1029 self._settings_obj.core_model_settings = core_model_settings
File ~/work/splink/splink/splink/em_training_session.py:239, in EMTrainingSession._train(self, cvv)
223 raise EMTrainingException(
224 f"Training rule {br_sql} resulted in no record pairs. "
225 "This means that in the supplied data set "
(...)
233 "the number of comparisons that will be generated by a blocking rule."
234 )
236 # Compute the new params, populating the paramters in the copied settings object
237 # At this stage, we do not overwrite any of the parameters
238 # in the original (main) setting object
--> 239 core_model_settings_history = expectation_maximisation(
240 db_api=self.db_api,
241 training_settings=self.training_settings,
242 estimate_without_term_frequencies=self.estimate_without_term_frequencies,
243 core_model_settings=self.core_model_settings,
244 unique_id_input_columns=self.unique_id_input_columns,
245 training_fixed_probabilities=self.training_fixed_probabilities,
246 df_comparison_vector_values=cvv,
247 )
248 self.core_model_settings = core_model_settings_history[-1]
249 self._core_model_settings_history = core_model_settings_history
File ~/work/splink/splink/splink/expectation_maximisation.py:308, in expectation_maximisation(db_api, training_settings, estimate_without_term_frequencies, core_model_settings, unique_id_input_columns, training_fixed_probabilities, df_comparison_vector_values)
306 if estimate_without_term_frequencies:
307 pipeline.append_input_dataframe(agreement_pattern_counts)
--> 308 df_params = db_api.sql_pipeline_to_splink_dataframe(pipeline)
309 else:
310 pipeline.append_input_dataframe(df_comparison_vector_values)
File ~/work/splink/splink/splink/database_api.py:203, in DatabaseAPI.sql_pipeline_to_splink_dataframe(self, pipeline, use_cache)
200 sql_gen = pipeline.generate_cte_pipeline_sql()
201 output_tablename_templated = pipeline.output_table_name
--> 203 splink_dataframe = self.sql_to_splink_dataframe_checking_cache(
204 sql_gen,
205 output_tablename_templated,
206 use_cache,
207 )
208 else:
209 # In debug mode, we do not pipeline the sql and print the
210 # results of each part of the pipeline
211 for cte in pipeline.ctes_pipeline():
File ~/work/splink/splink/splink/database_api.py:174, in DatabaseAPI.sql_to_splink_dataframe_checking_cache(self, sql, output_tablename_templated, use_cache)
171 print(df_pd) # noqa: T201
173 else:
--> 174 splink_dataframe = self._sql_to_splink_dataframe(
175 sql, output_tablename_templated, table_name_hash
176 )
178 splink_dataframe.created_by_splink = True
179 splink_dataframe.sql_used_to_create = sql
File ~/work/splink/splink/splink/database_api.py:95, in DatabaseAPI._sql_to_splink_dataframe(self, sql, templated_name, physical_name)
87 """
88 Create a table in the backend using some given sql
89
(...)
92 Returns a SplinkDataFrame which also uses templated_name
93 """
94 sql = self._setup_for_execute_sql(sql, physical_name)
---> 95 spark_df = self._log_and_run_sql_execution(sql, templated_name, physical_name)
96 output_df = self._cleanup_for_execute_sql(
97 spark_df, templated_name, physical_name
98 )
99 self._intermediate_table_cache.executed_queries.append(output_df)
File ~/work/splink/splink/splink/database_api.py:76, in DatabaseAPI._log_and_run_sql_execution(self, final_sql, templated_name, physical_name)
73 except Exception:
74 pass
---> 76 raise SplinkException(
77 f"Error executing the following sql for table "
78 f"`{templated_name}`({physical_name}):\n{final_sql}"
79 f"\n\nError was: {e}"
80 ) from e
SplinkException: Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_18687ce0b):
CREATE TABLE __splink__m_u_counts_18687ce0b AS
WITH
__splink__agreement_pattern_counts as (
select * from __splink__agreement_pattern_counts_3076a29b4),
__splink__df_match_weight_parts as (
select gamma_first_name,CASE
WHEN
gamma_first_name = -1
THEN cast(1.0 as float8)
WHEN
gamma_first_name = 2
THEN cast(77.35783803878434 as float8)
WHEN
gamma_first_name = 1
THEN cast(0.0011007765625665434 as float8)
WHEN
gamma_first_name = 0
THEN cast(0.018602948205839968 as float8)
END as bf_first_name,gamma_surname,CASE
WHEN
gamma_surname = -1
THEN cast(1.0 as float8)
WHEN
gamma_surname = 2
THEN cast(1765.0473905491197 as float8)
WHEN
gamma_surname = 1
THEN cast(0.0019234548247370445 as float8)
WHEN
gamma_surname = 0
THEN cast(0.01409343713614602 as float8)
END as bf_surname,gamma_postcode_fake,CASE
WHEN
gamma_postcode_fake = -1
THEN cast(1.0 as float8)
WHEN
gamma_postcode_fake = 3
THEN cast(9514.971484415264 as float8)
WHEN
gamma_postcode_fake = 2
THEN cast(0.015861999999999998 as float8)
WHEN
gamma_postcode_fake = 1
THEN cast(0.002266 as float8)
WHEN
gamma_postcode_fake = 0
THEN cast(0.00023429942855261672 as float8)
END as bf_postcode_fake,gamma_birth_place,CASE
WHEN
gamma_birth_place = -1
THEN cast(1.0 as float8)
WHEN
gamma_birth_place = 1
THEN cast(226.5320754716981 as float8)
WHEN
gamma_birth_place = 0
THEN cast(0.0 as float8)
END as bf_birth_place,gamma_occupation,CASE
WHEN
gamma_occupation = -1
THEN cast(1.0 as float8)
WHEN
gamma_occupation = 1
THEN cast(nan as float8)
WHEN
gamma_occupation = 0
THEN cast(nan as float8)
END as bf_occupation,agreement_pattern_count
from __splink__agreement_pattern_counts
),
__splink__df_predict as (
select
log2(cast(0.013693808738874281 as float8) * bf_first_name * bf_surname * bf_postcode_fake * bf_birth_place * bf_occupation) as match_weight,
CASE WHEN bf_first_name = 'infinity' OR bf_surname = 'infinity' OR bf_postcode_fake = 'infinity' OR bf_birth_place = 'infinity' OR bf_occupation = 'infinity' THEN 1.0 ELSE (cast(0.013693808738874281 as float8) * bf_first_name * bf_surname * bf_postcode_fake * bf_birth_place * bf_occupation)/(1+(cast(0.013693808738874281 as float8) * bf_first_name * bf_surname * bf_postcode_fake * bf_birth_place * bf_occupation)) END as match_probability,
gamma_first_name,bf_first_name,gamma_surname,bf_surname,gamma_postcode_fake,bf_postcode_fake,gamma_birth_place,bf_birth_place,gamma_occupation,bf_occupation,agreement_pattern_count
from __splink__df_match_weight_parts
)
select
gamma_first_name as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'first_name' as output_column_name
from __splink__df_predict
group by gamma_first_name
union all
select
gamma_surname as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'surname' as output_column_name
from __splink__df_predict
group by gamma_surname
union all
select
gamma_postcode_fake as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'postcode_fake' as output_column_name
from __splink__df_predict
group by gamma_postcode_fake
union all
select
gamma_birth_place as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'birth_place' as output_column_name
from __splink__df_predict
group by gamma_birth_place
union all
select
gamma_occupation as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'occupation' as output_column_name
from __splink__df_predict
group by gamma_occupation
union all
select 0 as comparison_vector_value,
sum(match_probability * agreement_pattern_count) /
sum(agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) /
sum(agreement_pattern_count) as u_count,
'_probability_two_random_records_match' as output_column_name
from __splink__df_predict
Error was: no such column: nan
============================== slowest durations ===============================
5.72s call docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
0.00s setup docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
0.00s teardown docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
=========================== short test summary info ============================
FAILED docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
============================== 1 failed in 6.36s ===============================
Error: Process completed with exit code 1.