Notebook test sometimes fails
ADBond opened this issue · comments
Example notebook sometimes fails in sqlite, see this run. Something random, as it passed on re-run.
I have a feeling this has happened before also. Not massively urgent, as it is not happening frequently and we can re-run the job, but it would be nice to get to the bottom of why it is happening.
For reference:
Click for full output
============================= test session starts ==============================
platform linux -- Python 3.9.19, pytest-8.0.0, pluggy-1.5.0 -- /home/runner/work/splink/splink/.venv/bin/python
cachedir: .pytest_cache
rootdir: /home/runner/work/splink/splink
configfile: pyproject.toml
plugins: anyio-4.3.0, nbmake-1.5.0, xdist-3.5.0
created: 2/2 workers
2 workers [1 item]
scheduling tests via LoadScheduling
docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
[gw0] [100%] FAILED docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
=================================== FAILURES ===================================
_ /home/runner/work/splink/splink/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb _
[gw0] linux -- Python 3.9.19 /home/runner/work/splink/splink/.venv/bin/python
---------------------------------------------------------------------------
df_predict = linker.predict()
df_e = df_predict.as_pandas_dataframe(limit=5)
df_e
---------------------------------------------------------------------------
OperationalError Traceback (most recent call last)
File ~/work/splink/splink/splink/database_api.py:64, in DatabaseAPI._log_and_run_sql_execution(self, final_sql, templated_name, physical_name)
63 try:
---> 64 return self._execute_sql_against_backend(final_sql)
65 except Exception as e:
66 # Parse our SQL through sqlglot to pretty print
File ~/work/splink/splink/splink/sqlite/database_api.py:103, in SQLiteAPI._execute_sql_against_backend(self, final_sql)
102 def _execute_sql_against_backend(self, final_sql: str) -> sqlite3.Cursor:
--> 103 return self.con.execute(final_sql)
OperationalError: user-defined function raised exception
The above exception was the direct cause of the following exception:
SplinkException Traceback (most recent call last)
Cell In[12], line 1
----> 1 df_predict = linker.predict()
2 df_e = df_predict.as_pandas_dataframe(limit=5)
3 df_e
File ~/work/splink/splink/splink/linker.py:1143, in Linker.predict(self, threshold_match_probability, threshold_match_weight, materialise_after_computing_term_frequencies)
1135 sqls = predict_from_comparison_vectors_sqls_using_settings(
1136 self._settings_obj,
1137 threshold_match_probability,
1138 threshold_match_weight,
1139 sql_infinity_expression=self._infinity_expression,
1140 )
1141 pipeline.enqueue_list_of_sqls(sqls)
-> 1143 predictions = self.db_api.sql_pipeline_to_splink_dataframe(pipeline)
1144 self._predict_warning()
1146 [b.drop_materialised_id_pairs_dataframe() for b in exploding_br_with_id_tables]
File ~/work/splink/splink/splink/database_api.py:203, in DatabaseAPI.sql_pipeline_to_splink_dataframe(self, pipeline, use_cache)
200 sql_gen = pipeline.generate_cte_pipeline_sql()
201 output_tablename_templated = pipeline.output_table_name
--> 203 splink_dataframe = self.sql_to_splink_dataframe_checking_cache(
204 sql_gen,
205 output_tablename_templated,
206 use_cache,
207 )
208 else:
209 # In debug mode, we do not pipeline the sql and print the
210 # results of each part of the pipeline
211 for cte in pipeline.ctes_pipeline():
File ~/work/splink/splink/splink/database_api.py:174, in DatabaseAPI.sql_to_splink_dataframe_checking_cache(self, sql, output_tablename_templated, use_cache)
171 print(df_pd) # noqa: T201
173 else:
--> 174 splink_dataframe = self._sql_to_splink_dataframe(
175 sql, output_tablename_templated, table_name_hash
176 )
178 splink_dataframe.created_by_splink = True
179 splink_dataframe.sql_used_to_create = sql
File ~/work/splink/splink/splink/database_api.py:95, in DatabaseAPI._sql_to_splink_dataframe(self, sql, templated_name, physical_name)
87 """
88 Create a table in the backend using some given sql
89
(...)
92 Returns a SplinkDataFrame which also uses templated_name
93 """
94 sql = self._setup_for_execute_sql(sql, physical_name)
---> 95 spark_df = self._log_and_run_sql_execution(sql, templated_name, physical_name)
96 output_df = self._cleanup_for_execute_sql(
97 spark_df, templated_name, physical_name
98 )
99 self._intermediate_table_cache.executed_queries.append(output_df)
File ~/work/splink/splink/splink/database_api.py:76, in DatabaseAPI._log_and_run_sql_execution(self, final_sql, templated_name, physical_name)
73 except Exception:
74 pass
---> 76 raise SplinkException(
77 f"Error executing the following sql for table "
78 f"`{templated_name}`({physical_name}):\n{final_sql}"
79 f"\n\nError was: {e}"
80 ) from e
SplinkException: Error executing the following sql for table `__splink__df_predict`(__splink__df_predict_2a9a4ba75):
CREATE TABLE __splink__df_predict_2a9a4ba75 AS
WITH
__splink__df_concat_with_tf as (
select * from __splink__df_concat_with_tf_b6af2d7ee),
__splink__df_blocked as (
select
"l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."first_name" AS "first_name_l", "r"."first_name" AS "first_name_r", "l"."tf_first_name" AS "tf_first_name_l", "r"."tf_first_name" AS "tf_first_name_r", "l"."surname" AS "surname_l", "r"."surname" AS "surname_r", "l"."tf_surname" AS "tf_surname_l", "r"."tf_surname" AS "tf_surname_r", "l"."dob" AS "dob_l", "r"."dob" AS "dob_r", "l"."tf_dob" AS "tf_dob_l", "r"."tf_dob" AS "tf_dob_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."tf_birth_place" AS "tf_birth_place_l", "r"."tf_birth_place" AS "tf_birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r", "l"."tf_occupation" AS "tf_occupation_l", "r"."tf_occupation" AS "tf_occupation_r"
, '0' as match_key
from __splink__df_concat_with_tf as l
inner join __splink__df_concat_with_tf as r
on
(l.first_name = r.first_name and l.surname = r.surname)
where l."unique_id" < r."unique_id"
UNION ALL
select
"l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."first_name" AS "first_name_l", "r"."first_name" AS "first_name_r", "l"."tf_first_name" AS "tf_first_name_l", "r"."tf_first_name" AS "tf_first_name_r", "l"."surname" AS "surname_l", "r"."surname" AS "surname_r", "l"."tf_surname" AS "tf_surname_l", "r"."tf_surname" AS "tf_surname_r", "l"."dob" AS "dob_l", "r"."dob" AS "dob_r", "l"."tf_dob" AS "tf_dob_l", "r"."tf_dob" AS "tf_dob_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."tf_birth_place" AS "tf_birth_place_l", "r"."tf_birth_place" AS "tf_birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r", "l"."tf_occupation" AS "tf_occupation_l", "r"."tf_occupation" AS "tf_occupation_r"
, '1' as match_key
from __splink__df_concat_with_tf as l
inner join __splink__df_concat_with_tf as r
on
(l.surname = r.surname and l.dob = r.dob)
where l."unique_id" < r."unique_id"
AND NOT (coalesce((l.first_name = r.first_name and l.surname = r.surname),false))
UNION ALL
select
"l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."first_name" AS "first_name_l", "r"."first_name" AS "first_name_r", "l"."tf_first_name" AS "tf_first_name_l", "r"."tf_first_name" AS "tf_first_name_r", "l"."surname" AS "surname_l", "r"."surname" AS "surname_r", "l"."tf_surname" AS "tf_surname_l", "r"."tf_surname" AS "tf_surname_r", "l"."dob" AS "dob_l", "r"."dob" AS "dob_r", "l"."tf_dob" AS "tf_dob_l", "r"."tf_dob" AS "tf_dob_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."tf_birth_place" AS "tf_birth_place_l", "r"."tf_birth_place" AS "tf_birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r", "l"."tf_occupation" AS "tf_occupation_l", "r"."tf_occupation" AS "tf_occupation_r"
, '2' as match_key
from __splink__df_concat_with_tf as l
inner join __splink__df_concat_with_tf as r
on
(l.first_name = r.first_name and l.dob = r.dob)
where l."unique_id" < r."unique_id"
AND NOT (coalesce((l.first_name = r.first_name and l.surname = r.surname),false) OR coalesce((l.surname = r.surname and l.dob = r.dob),false))
UNION ALL
select
"l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."first_name" AS "first_name_l", "r"."first_name" AS "first_name_r", "l"."tf_first_name" AS "tf_first_name_l", "r"."tf_first_name" AS "tf_first_name_r", "l"."surname" AS "surname_l", "r"."surname" AS "surname_r", "l"."tf_surname" AS "tf_surname_l", "r"."tf_surname" AS "tf_surname_r", "l"."dob" AS "dob_l", "r"."dob" AS "dob_r", "l"."tf_dob" AS "tf_dob_l", "r"."tf_dob" AS "tf_dob_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."tf_birth_place" AS "tf_birth_place_l", "r"."tf_birth_place" AS "tf_birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r", "l"."tf_occupation" AS "tf_occupation_l", "r"."tf_occupation" AS "tf_occupation_r"
, '3' as match_key
from __splink__df_concat_with_tf as l
inner join __splink__df_concat_with_tf as r
on
(l.postcode_fake = r.postcode_fake and l.first_name = r.first_name)
where l."unique_id" < r."unique_id"
AND NOT (coalesce((l.first_name = r.first_name and l.surname = r.surname),false) OR coalesce((l.surname = r.surname and l.dob = r.dob),false) OR coalesce((l.first_name = r.first_name and l.dob = r.dob),false))
),
__splink__df_comparison_vectors as (
select "unique_id_l","unique_id_r","first_name_l","first_name_r",CASE WHEN "first_name_l" IS NULL OR "first_name_r" IS NULL THEN -1 WHEN "first_name_l" = "first_name_r" THEN 2 WHEN jaro_winkler("first_name_l", "first_name_r") >= 0.9 THEN 1 ELSE 0 END as gamma_first_name,"tf_first_name_l","tf_first_name_r","surname_l","surname_r",CASE WHEN "surname_l" IS NULL OR "surname_r" IS NULL THEN -1 WHEN "surname_l" = "surname_r" THEN 2 WHEN jaro_winkler("surname_l", "surname_r") >= 0.9 THEN 1 ELSE 0 END as gamma_surname,"tf_surname_l","tf_surname_r","dob_l","dob_r",CASE WHEN "dob_l" IS NULL OR "dob_r" IS NULL THEN -1 WHEN "dob_l" = "dob_r" THEN 3 WHEN damerau_levenshtein("dob_l", "dob_r") <= 1 THEN 2 WHEN damerau_levenshtein("dob_l", "dob_r") <= 2 THEN 1 ELSE 0 END as gamma_dob,"tf_dob_l","tf_dob_r","postcode_fake_l","postcode_fake_r",CASE WHEN "postcode_fake_l" IS NULL OR "postcode_fake_r" IS NULL THEN -1 WHEN "postcode_fake_l" = "postcode_fake_r" THEN 3 WHEN damerau_levenshtein("postcode_fake_l", "postcode_fake_r") <= 1 THEN 2 WHEN damerau_levenshtein("postcode_fake_l", "postcode_fake_r") <= 2 THEN 1 ELSE 0 END as gamma_postcode_fake,"birth_place_l","birth_place_r",CASE WHEN "birth_place_l" IS NULL OR "birth_place_r" IS NULL THEN -1 WHEN "birth_place_l" = "birth_place_r" THEN 1 ELSE 0 END as gamma_birth_place,"tf_birth_place_l","tf_birth_place_r","occupation_l","occupation_r",CASE WHEN "occupation_l" IS NULL OR "occupation_r" IS NULL THEN -1 WHEN "occupation_l" = "occupation_r" THEN 1 ELSE 0 END as gamma_occupation,"tf_occupation_l","tf_occupation_r",match_key
from __splink__df_blocked
),
__splink__df_match_weight_parts as (
select "unique_id_l","unique_id_r","first_name_l","first_name_r",gamma_first_name,"tf_first_name_l","tf_first_name_r",CASE
WHEN
gamma_first_name = -1
THEN cast(1.0 as float8)
WHEN
gamma_first_name = 2
THEN cast(22.538194655324588 as float8)
WHEN
gamma_first_name = 1
THEN cast(8.398460821344122e-10 as float8)
WHEN
gamma_first_name = 0
THEN cast(0.9595244959440041 as float8)
END as bf_first_name ,CASE WHEN gamma_first_name = -1 then cast(1 as float8) WHEN gamma_first_name = 2 then
(CASE WHEN coalesce("tf_first_name_l", "tf_first_name_r") is not null
THEN
POW(
cast(0.015050125313283208 as float8) /
(CASE
WHEN coalesce("tf_first_name_l", "tf_first_name_r") >= coalesce("tf_first_name_r", "tf_first_name_l")
THEN coalesce("tf_first_name_l", "tf_first_name_r")
ELSE coalesce("tf_first_name_r", "tf_first_name_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_first_name = 1 then cast(1 as float8) WHEN gamma_first_name = 0 then cast(1 as float8) END as bf_tf_adj_first_name ,"surname_l","surname_r",gamma_surname,"tf_surname_l","tf_surname_r",CASE
WHEN
gamma_surname = -1
THEN cast(1.0 as float8)
WHEN
gamma_surname = 2
THEN cast(829.7339196865737 as float8)
WHEN
gamma_surname = 1
THEN cast(2.347650066419875e-274 as float8)
WHEN
gamma_surname = 0
THEN cast(0.5640101767895112 as float8)
END as bf_surname ,CASE WHEN gamma_surname = -1 then cast(1 as float8) WHEN gamma_surname = 2 then
(CASE WHEN coalesce("tf_surname_l", "tf_surname_r") is not null
THEN
POW(
cast(0.0006437607605359682 as float8) /
(CASE
WHEN coalesce("tf_surname_l", "tf_surname_r") >= coalesce("tf_surname_r", "tf_surname_l")
THEN coalesce("tf_surname_l", "tf_surname_r")
ELSE coalesce("tf_surname_r", "tf_surname_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_surname = 1 then cast(1 as float8) WHEN gamma_surname = 0 then cast(1 as float8) END as bf_tf_adj_surname ,"dob_l","dob_r",gamma_dob,"tf_dob_l","tf_dob_r",CASE
WHEN
gamma_dob = -1
THEN cast(1.0 as float8)
WHEN
gamma_dob = 3
THEN cast(599.0777027027026 as float8)
WHEN
gamma_dob = 2
THEN cast(49.328752642706135 as float8)
WHEN
gamma_dob = 1
THEN cast(0.2054278922345485 as float8)
WHEN
gamma_dob = 0
THEN cast(1.1247139075244796e-34 as float8)
END as bf_dob ,CASE WHEN gamma_dob = -1 then cast(1 as float8) WHEN gamma_dob = 3 then
(CASE WHEN coalesce("tf_dob_l", "tf_dob_r") is not null
THEN
POW(
cast(0.001585770920390014 as float8) /
(CASE
WHEN coalesce("tf_dob_l", "tf_dob_r") >= coalesce("tf_dob_r", "tf_dob_l")
THEN coalesce("tf_dob_l", "tf_dob_r")
ELSE coalesce("tf_dob_r", "tf_dob_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_dob = 2 then cast(1 as float8) WHEN gamma_dob = 1 then cast(1 as float8) WHEN gamma_dob = 0 then cast(1 as float8) END as bf_tf_adj_dob ,"postcode_fake_l","postcode_fake_r",gamma_postcode_fake,CASE
WHEN
gamma_postcode_fake = -1
THEN cast(1.0 as float8)
WHEN
gamma_postcode_fake = 3
THEN cast(6939.673139652026 as float8)
WHEN
gamma_postcode_fake = 2
THEN cast(23793.00028541576 as float8)
WHEN
gamma_postcode_fake = 1
THEN cast(689.6354172768708 as float8)
WHEN
gamma_postcode_fake = 0
THEN cast(1.109823362613782e-64 as float8)
END as bf_postcode_fake ,"birth_place_l","birth_place_r",gamma_birth_place,"tf_birth_place_l","tf_birth_place_r",CASE
WHEN
gamma_birth_place = -1
THEN cast(1.0 as float8)
WHEN
gamma_birth_place = 1
THEN cast(219.43014705882354 as float8)
WHEN
gamma_birth_place = 0
THEN cast(6.648202339322792e-35 as float8)
END as bf_birth_place ,CASE WHEN gamma_birth_place = -1 then cast(1 as float8) WHEN gamma_birth_place = 1 then
(CASE WHEN coalesce("tf_birth_place_l", "tf_birth_place_r") is not null
THEN
POW(
cast(0.0045572589427829435 as float8) /
(CASE
WHEN coalesce("tf_birth_place_l", "tf_birth_place_r") >= coalesce("tf_birth_place_r", "tf_birth_place_l")
THEN coalesce("tf_birth_place_l", "tf_birth_place_r")
ELSE coalesce("tf_birth_place_r", "tf_birth_place_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_birth_place = 0 then cast(1 as float8) END as bf_tf_adj_birth_place ,"occupation_l","occupation_r",gamma_occupation,"tf_occupation_l","tf_occupation_r",CASE
WHEN
gamma_occupation = -1
THEN cast(1.0 as float8)
WHEN
gamma_occupation = 1
THEN cast(23.141810460746804 as float8)
WHEN
gamma_occupation = 0
THEN cast(0.1645048944611773 as float8)
END as bf_occupation ,CASE WHEN gamma_occupation = -1 then cast(1 as float8) WHEN gamma_occupation = 1 then
(CASE WHEN coalesce("tf_occupation_l", "tf_occupation_r") is not null
THEN
POW(
cast(0.036361752822956596 as float8) /
(CASE
WHEN coalesce("tf_occupation_l", "tf_occupation_r") >= coalesce("tf_occupation_r", "tf_occupation_l")
THEN coalesce("tf_occupation_l", "tf_occupation_r")
ELSE coalesce("tf_occupation_r", "tf_occupation_l")
END)
,
cast(1.0 as float8)
)
ELSE cast(1 as float8)
END) WHEN gamma_occupation = 0 then cast(1 as float8) END as bf_tf_adj_occupation ,match_key
from __splink__df_comparison_vectors
)
select
log2(cast(8.354916868577158e-05 as float8) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_tf_adj_surname * bf_dob * bf_tf_adj_dob * bf_postcode_fake * bf_birth_place * bf_tf_adj_birth_place * bf_occupation * bf_tf_adj_occupation) as match_weight,
CASE WHEN bf_first_name = 'infinity' OR bf_tf_adj_first_name = 'infinity' OR bf_surname = 'infinity' OR bf_tf_adj_surname = 'infinity' OR bf_dob = 'infinity' OR bf_tf_adj_dob = 'infinity' OR bf_postcode_fake = 'infinity' OR bf_birth_place = 'infinity' OR bf_tf_adj_birth_place = 'infinity' OR bf_occupation = 'infinity' OR bf_tf_adj_occupation = 'infinity' THEN 1.0 ELSE (cast(8.354916868577158e-05 as float8) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_tf_adj_surname * bf_dob * bf_tf_adj_dob * bf_postcode_fake * bf_birth_place * bf_tf_adj_birth_place * bf_occupation * bf_tf_adj_occupation)/(1+(cast(8.354916868577158e-05 as float8) * bf_first_name * bf_tf_adj_first_name * bf_surname * bf_tf_adj_surname * bf_dob * bf_tf_adj_dob * bf_postcode_fake * bf_birth_place * bf_tf_adj_birth_place * bf_occupation * bf_tf_adj_occupation)) END as match_probability,
"unique_id_l","unique_id_r","first_name_l","first_name_r",gamma_first_name,"tf_first_name_l","tf_first_name_r",bf_first_name,bf_tf_adj_first_name,"surname_l","surname_r",gamma_surname,"tf_surname_l","tf_surname_r",bf_surname,bf_tf_adj_surname,"dob_l","dob_r",gamma_dob,"tf_dob_l","tf_dob_r",bf_dob,bf_tf_adj_dob,"postcode_fake_l","postcode_fake_r",gamma_postcode_fake,bf_postcode_fake,"birth_place_l","birth_place_r",gamma_birth_place,"tf_birth_place_l","tf_birth_place_r",bf_birth_place,bf_tf_adj_birth_place,"occupation_l","occupation_r",gamma_occupation,"tf_occupation_l","tf_occupation_r",bf_occupation,bf_tf_adj_occupation,match_key
from __splink__df_match_weight_parts
Error was: user-defined function raised exception
============================== slowest durations ===============================
7.79s call docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
0.00s setup docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
0.00s teardown docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
=========================== short test summary info ============================
FAILED docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
============================== 1 failed in 8.36s ===============================
Error: Process completed with exit code 1.
See also: this run
Run logs
============================= test session starts ==============================
platform linux -- Python 3.9.19, pytest-8.0.0, pluggy-1.5.0 -- /home/runner/work/splink/splink/.venv/bin/python
cachedir: .pytest_cache
rootdir: /home/runner/work/splink/splink
configfile: pyproject.toml
plugins: anyio-4.3.0, nbmake-1.5.0, xdist-3.5.0
created: 2/2 workers
2 workers [1 item]
scheduling tests via LoadScheduling
docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
[gw0] [100%] FAILED docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
=================================== FAILURES ===================================
_ /home/runner/work/splink/splink/docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb _
[gw0] linux -- Python 3.9.19 /home/runner/work/splink/splink/.venv/bin/python
---------------------------------------------------------------------------
training_blocking_rule = "l.dob = r.dob"
training_session_dob = linker.estimate_parameters_using_expectation_maximisation(
training_blocking_rule, estimate_without_term_frequencies=True
)
---------------------------------------------------------------------------
OperationalError Traceback (most recent call last)
File ~/work/splink/splink/splink/database_api.py:64, in DatabaseAPI._log_and_run_sql_execution(self, final_sql, templated_name, physical_name)
63 try:
---> 64 return self._execute_sql_against_backend(final_sql)
65 except Exception as e:
66 # Parse our SQL through sqlglot to pretty print
File ~/work/splink/splink/splink/sqlite/database_api.py:103, in SQLiteAPI._execute_sql_against_backend(self, final_sql)
102 def _execute_sql_against_backend(self, final_sql: str) -> sqlite3.Cursor:
--> 103 return self.con.execute(final_sql)
OperationalError: no such column: nan
The above exception was the direct cause of the following exception:
SplinkException Traceback (most recent call last)
Cell In[9], line 2
1 training_blocking_rule = "l.dob = r.dob"
----> 2 training_session_dob = linker.estimate_parameters_using_expectation_maximisation(
3 training_blocking_rule, estimate_without_term_frequencies=True
4 )
File ~/work/splink/splink/splink/linker.py:1027, in Linker.estimate_parameters_using_expectation_maximisation(self, blocking_rule, comparisons_to_deactivate, comparison_levels_to_reverse_blocking_rule, estimate_without_term_frequencies, fix_probability_two_random_records_match, fix_m_probabilities, fix_u_probabilities, populate_probability_two_random_records_match_from_trained_values)
1001 logger.warning(
1002 "\nWARNING: \n"
1003 "You have provided comparisons_to_deactivate but not "
(...)
1009 "as an exact match."
1010 )
1012 em_training_session = EMTrainingSession(
1013 self,
1014 db_api=self.db_api,
(...)
1024 estimate_without_term_frequencies=estimate_without_term_frequencies,
1025 )
-> 1027 core_model_settings = em_training_session._train()
1028 # overwrite with the newly trained values in our linker settings
1029 self._settings_obj.core_model_settings = core_model_settings
File ~/work/splink/splink/splink/em_training_session.py:239, in EMTrainingSession._train(self, cvv)
223 raise EMTrainingException(
224 f"Training rule {br_sql} resulted in no record pairs. "
225 "This means that in the supplied data set "
(...)
233 "the number of comparisons that will be generated by a blocking rule."
234 )
236 # Compute the new params, populating the paramters in the copied settings object
237 # At this stage, we do not overwrite any of the parameters
238 # in the original (main) setting object
--> 239 core_model_settings_history = expectation_maximisation(
240 db_api=self.db_api,
241 training_settings=self.training_settings,
242 estimate_without_term_frequencies=self.estimate_without_term_frequencies,
243 core_model_settings=self.core_model_settings,
244 unique_id_input_columns=self.unique_id_input_columns,
245 training_fixed_probabilities=self.training_fixed_probabilities,
246 df_comparison_vector_values=cvv,
247 )
248 self.core_model_settings = core_model_settings_history[-1]
249 self._core_model_settings_history = core_model_settings_history
File ~/work/splink/splink/splink/expectation_maximisation.py:308, in expectation_maximisation(db_api, training_settings, estimate_without_term_frequencies, core_model_settings, unique_id_input_columns, training_fixed_probabilities, df_comparison_vector_values)
306 if estimate_without_term_frequencies:
307 pipeline.append_input_dataframe(agreement_pattern_counts)
--> 308 df_params = db_api.sql_pipeline_to_splink_dataframe(pipeline)
309 else:
310 pipeline.append_input_dataframe(df_comparison_vector_values)
File ~/work/splink/splink/splink/database_api.py:203, in DatabaseAPI.sql_pipeline_to_splink_dataframe(self, pipeline, use_cache)
200 sql_gen = pipeline.generate_cte_pipeline_sql()
201 output_tablename_templated = pipeline.output_table_name
--> 203 splink_dataframe = self.sql_to_splink_dataframe_checking_cache(
204 sql_gen,
205 output_tablename_templated,
206 use_cache,
207 )
208 else:
209 # In debug mode, we do not pipeline the sql and print the
210 # results of each part of the pipeline
211 for cte in pipeline.ctes_pipeline():
File ~/work/splink/splink/splink/database_api.py:174, in DatabaseAPI.sql_to_splink_dataframe_checking_cache(self, sql, output_tablename_templated, use_cache)
171 print(df_pd) # noqa: T201
173 else:
--> 174 splink_dataframe = self._sql_to_splink_dataframe(
175 sql, output_tablename_templated, table_name_hash
176 )
178 splink_dataframe.created_by_splink = True
179 splink_dataframe.sql_used_to_create = sql
File ~/work/splink/splink/splink/database_api.py:95, in DatabaseAPI._sql_to_splink_dataframe(self, sql, templated_name, physical_name)
87 """
88 Create a table in the backend using some given sql
89
(...)
92 Returns a SplinkDataFrame which also uses templated_name
93 """
94 sql = self._setup_for_execute_sql(sql, physical_name)
---> 95 spark_df = self._log_and_run_sql_execution(sql, templated_name, physical_name)
96 output_df = self._cleanup_for_execute_sql(
97 spark_df, templated_name, physical_name
98 )
99 self._intermediate_table_cache.executed_queries.append(output_df)
File ~/work/splink/splink/splink/database_api.py:76, in DatabaseAPI._log_and_run_sql_execution(self, final_sql, templated_name, physical_name)
73 except Exception:
74 pass
---> 76 raise SplinkException(
77 f"Error executing the following sql for table "
78 f"`{templated_name}`({physical_name}):\n{final_sql}"
79 f"\n\nError was: {e}"
80 ) from e
SplinkException: Error executing the following sql for table `__splink__m_u_counts`(__splink__m_u_counts_18687ce0b):
CREATE TABLE __splink__m_u_counts_18687ce0b AS
WITH
__splink__agreement_pattern_counts as (
select * from __splink__agreement_pattern_counts_3076a29b4),
__splink__df_match_weight_parts as (
select gamma_first_name,CASE
WHEN
gamma_first_name = -1
THEN cast(1.0 as float8)
WHEN
gamma_first_name = 2
THEN cast(77.35783803878434 as float8)
WHEN
gamma_first_name = 1
THEN cast(0.0011007765625665434 as float8)
WHEN
gamma_first_name = 0
THEN cast(0.018602948205839968 as float8)
END as bf_first_name,gamma_surname,CASE
WHEN
gamma_surname = -1
THEN cast(1.0 as float8)
WHEN
gamma_surname = 2
THEN cast(1765.0473905491197 as float8)
WHEN
gamma_surname = 1
THEN cast(0.0019234548247370445 as float8)
WHEN
gamma_surname = 0
THEN cast(0.01409343713614602 as float8)
END as bf_surname,gamma_postcode_fake,CASE
WHEN
gamma_postcode_fake = -1
THEN cast(1.0 as float8)
WHEN
gamma_postcode_fake = 3
THEN cast(9514.971484415264 as float8)
WHEN
gamma_postcode_fake = 2
THEN cast(0.015861999999999998 as float8)
WHEN
gamma_postcode_fake = 1
THEN cast(0.002266 as float8)
WHEN
gamma_postcode_fake = 0
THEN cast(0.00023429942855261672 as float8)
END as bf_postcode_fake,gamma_birth_place,CASE
WHEN
gamma_birth_place = -1
THEN cast(1.0 as float8)
WHEN
gamma_birth_place = 1
THEN cast(226.5320754716981 as float8)
WHEN
gamma_birth_place = 0
THEN cast(0.0 as float8)
END as bf_birth_place,gamma_occupation,CASE
WHEN
gamma_occupation = -1
THEN cast(1.0 as float8)
WHEN
gamma_occupation = 1
THEN cast(nan as float8)
WHEN
gamma_occupation = 0
THEN cast(nan as float8)
END as bf_occupation,agreement_pattern_count
from __splink__agreement_pattern_counts
),
__splink__df_predict as (
select
log2(cast(0.013693808738874281 as float8) * bf_first_name * bf_surname * bf_postcode_fake * bf_birth_place * bf_occupation) as match_weight,
CASE WHEN bf_first_name = 'infinity' OR bf_surname = 'infinity' OR bf_postcode_fake = 'infinity' OR bf_birth_place = 'infinity' OR bf_occupation = 'infinity' THEN 1.0 ELSE (cast(0.013693808738874281 as float8) * bf_first_name * bf_surname * bf_postcode_fake * bf_birth_place * bf_occupation)/(1+(cast(0.013693808738874281 as float8) * bf_first_name * bf_surname * bf_postcode_fake * bf_birth_place * bf_occupation)) END as match_probability,
gamma_first_name,bf_first_name,gamma_surname,bf_surname,gamma_postcode_fake,bf_postcode_fake,gamma_birth_place,bf_birth_place,gamma_occupation,bf_occupation,agreement_pattern_count
from __splink__df_match_weight_parts
)
select
gamma_first_name as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'first_name' as output_column_name
from __splink__df_predict
group by gamma_first_name
union all
select
gamma_surname as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'surname' as output_column_name
from __splink__df_predict
group by gamma_surname
union all
select
gamma_postcode_fake as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'postcode_fake' as output_column_name
from __splink__df_predict
group by gamma_postcode_fake
union all
select
gamma_birth_place as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'birth_place' as output_column_name
from __splink__df_predict
group by gamma_birth_place
union all
select
gamma_occupation as comparison_vector_value,
sum(match_probability * agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) as u_count,
'occupation' as output_column_name
from __splink__df_predict
group by gamma_occupation
union all
select 0 as comparison_vector_value,
sum(match_probability * agreement_pattern_count) /
sum(agreement_pattern_count) as m_count,
sum((1-match_probability) * agreement_pattern_count) /
sum(agreement_pattern_count) as u_count,
'_probability_two_random_records_match' as output_column_name
from __splink__df_predict
Error was: no such column: nan
============================== slowest durations ===============================
5.72s call docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
0.00s setup docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
0.00s teardown docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
=========================== short test summary info ============================
FAILED docs/demos/examples/sqlite/deduplicate_50k_synthetic.ipynb::
============================== 1 failed in 6.36s ===============================
Error: Process completed with exit code 1.