Incompatible types error for joins not occurring.
JoDudding opened this issue · comments
When joining Spark tables whose join column has a different class in each table (character in one, numeric in the other), the join does not fail as it does for local data frames in dplyr. Instead, the join column is silently converted to character. I'm using sparklyr 1.7.8.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(sparklyr)
#>
#> Attaching package: 'sparklyr'
#> The following object is masked from 'package:stats':
#>
#> filter
# Spark connection ----

# NOTE(review): `user` is not referenced again in the script as shown.
user <- Sys.getenv("USER")

# Start from sparklyr's default configuration and override executor sizing,
# the driver result-size cap, dynamic-allocation bounds, and the Kryo buffer.
conf <- sparklyr::spark_config()
conf$spark.executor.instances <- 2
conf$spark.executor.cores <- 2
conf$spark.executor.memory <- "8G"
conf$spark.driver.maxResultSize <- "6G"
conf$spark.dynamicAllocation.executorIdleTimeout <- "60s"
conf$spark.dynamicAllocation.cachedExecutorIdleTimeout <- "20m"
conf$spark.dynamicAllocation.initialExecutors <- 1
conf$spark.dynamicAllocation.minExecutors <- 0
conf$spark.dynamicAllocation.maxExecutors <- 8
conf$spark.kryoserializer.buffer.max <- "1G"

# Open the connection used by the Spark examples below. This is top-level
# code, so plain `<-` is used instead of the superassignment operator `<<-`.
sc <- sparklyr::spark_connect(
  master = "yarn-client",
  version = "2.4.3",
  config = conf
)
# Example data ----

# Nine customers whose key `customer_no` is stored as an integer column.
cust_numeric <- tibble(
  customer_no = c(
    10000001L,
    20000002L,
    30000003L,
    40000004L,
    50000005L,
    60000006L,
    70000007L,
    80000008L,
    90000009L
  )
)

# Five of the same customer numbers, stored as a character column.
cust_character <- tibble(
  customer_no = c(
    "10000001",
    "20000002",
    "70000007",
    "80000008",
    "90000009"
  )
)
# Local (in-memory) joins, tried in both directions ----

# Integer key on the left, character key on the right: dplyr aborts with an
# incompatible-types error, which is the expected behaviour.
cust_numeric |>
  full_join(cust_character, by = "customer_no") |>
  print()
#> Error in `full_join()`:
#> ! Can't join on `x$customer_no` x `y$customer_no` because of
#> incompatible types.
#> ℹ `x$customer_no` is of type <integer>>.
#> ℹ `y$customer_no` is of type <character>>.
#> Backtrace:
#> ▆
#> 1. ├─base::print(full_join(cust_numeric, cust_character, by = "customer_no"))
#> 2. ├─dplyr::full_join(cust_numeric, cust_character, by = "customer_no")
#> 3. └─dplyr:::full_join.data.frame(cust_numeric, cust_character, by = "customer_no")
#> 4. └─dplyr:::join_mutate(...)
#> 5. └─dplyr:::join_rows(...)
#> 6. └─base::tryCatch(...)
#> 7. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 8. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 9. └─value[[3L]](cond)
#> 10. └─rlang::abort(bullets, call = error_call)
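# NOTE: the message below names `left_join()` and <double>, whereas the code above calls `full_join()` on an integer key; it was presumably pasted from an earlier run of this example.
# Error in `left_join()`:
# ! Can't join on `x$customer_no` x `y$customer_no` because of incompatible types.
# ℹ `x$customer_no` is of type <double>>.
# ℹ `y$customer_no` is of type <character>>.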
# Character key on the left, integer key on the right: dplyr aborts with the
# same incompatible-types error.
cust_character |>
  full_join(cust_numeric, by = "customer_no") |>
  print()
#> Error in `full_join()`:
#> ! Can't join on `x$customer_no` x `y$customer_no` because of
#> incompatible types.
#> ℹ `x$customer_no` is of type <character>>.
#> ℹ `y$customer_no` is of type <integer>>.
#> Backtrace:
#> ▆
#> 1. ├─base::print(full_join(cust_character, cust_numeric, by = "customer_no"))
#> 2. ├─dplyr::full_join(cust_character, cust_numeric, by = "customer_no")
#> 3. └─dplyr:::full_join.data.frame(cust_character, cust_numeric, by = "customer_no")
#> 4. └─dplyr:::join_mutate(...)
#> 5. └─dplyr:::join_rows(...)
#> 6. └─base::tryCatch(...)
#> 7. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 8. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 9. └─value[[3L]](cond)
#> 10. └─rlang::abort(bullets, call = error_call)
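# NOTE: the message below names `left_join()` and <double>, whereas the code above calls `full_join()` on an integer key; it was presumably pasted from an earlier run of this example.
# Error in `left_join()`:
# ! Can't join on `x$customer_no` x `y$customer_no` because of incompatible types.
# ℹ `x$customer_no` is of type <character>>.
# ℹ `y$customer_no` is of type <double>>.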
# Copy both local tibbles to Spark through the connection `sc`, keeping a
# remote table reference for each. No table name is passed, so the default
# name is used (presumably derived from the data-frame argument -- confirm in
# the sparklyr docs); `overwrite = TRUE` is passed so that re-running the
# script does not fail on an existing table of the same name.
cust_numeric_spark <- copy_to(sc, cust_numeric, overwrite = TRUE)
cust_character_spark <- copy_to(sc, cust_character, overwrite = TRUE)
# Spark joins, tried in both directions ----

# Integer key on the left, character key on the right, now on Spark tables:
# no error is raised and the printed result shows `customer_no` as <chr>.
cust_numeric_spark |>
  full_join(cust_character_spark, by = "customer_no") |>
  print()
#> # Source: spark<?> [?? x 1]
#> customer_no
#> <chr>
#> 1 60000006
#> 2 10000001
#> 3 40000004
#> 4 20000002
#> 5 70000007
#> 6 80000008
#> 7 30000003
#> 8 90000009
#> 9 50000005
# Character key on the left, integer key on the right, on Spark tables: again
# no error is raised and the printed result shows `customer_no` as <chr>.
cust_character_spark |>
  full_join(cust_numeric_spark, by = "customer_no") |>
  print()
#> # Source: spark<?> [?? x 1]
#> customer_no
#> <chr>
#> 1 60000006
#> 2 10000001
#> 3 40000004
#> 4 20000002
#> 5 70000007
#> 6 80000008
#> 7 30000003
#> 8 90000009
#> 9 50000005
Created on 2023-11-27 with reprex v2.0.2
Session info
# Print the R version, platform settings, and package versions used to
# produce this report.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.1.3 (2022-03-10)
#> os RHEL
#> system x86_64, linux-gnu
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Pacific/Auckland
#> date 2023-11-27
#> pandoc 2.2.3.2 @ /data/disk1/anaconda3/bin/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> askpass 1.1 2019-01-13 [1] CRAN (R 4.1.0)
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.1.0)
#> base64enc 0.1-3 2015-07-28 [1] CRAN (R 4.1.0)
#> blob 1.2.3 2022-04-10 [1] CRAN (R 4.1.3)
#> cli 3.4.1 2022-09-23 [1] CRAN (R 4.1.3)
#> config 0.3.1 2020-12-17 [1] CRAN (R 4.1.0)
#> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.1.3)
#> dbplyr 2.2.1 2022-06-27 [1] CRAN (R 4.1.3)
#> digest 0.6.29 2021-12-01 [1] CRAN (R 4.1.1)
#> dplyr * 1.0.10 2022-09-01 [1] CRAN (R 4.1.3)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.1.0)
#> evaluate 0.16 2022-08-09 [1] CRAN (R 4.1.3)
#> fansi 1.0.3 2022-03-24 [1] CRAN (R 4.1.2)
#> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.1.0)
#> forge 0.2.0 2019-02-26 [1] CRAN (R 4.1.0)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.1.3)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.1.3)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.1.2)
#> highr 0.9 2021-04-16 [1] CRAN (R 4.1.0)
#> htmltools 0.5.3 2022-07-18 [1] CRAN (R 4.1.3)
#> htmlwidgets 1.5.4 2021-09-08 [1] CRAN (R 4.1.1)
#> httr 1.4.4 2022-08-17 [1] CRAN (R 4.1.3)
#> jsonlite 1.8.0 2022-02-22 [1] CRAN (R 4.1.2)
#> knitr 1.40 2022-08-24 [1] CRAN (R 4.1.3)
#> lifecycle 1.0.2 2022-09-09 [1] CRAN (R 4.1.3)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.1.3)
#> openssl 2.0.3 2022-09-14 [1] CRAN (R 4.1.3)
#> pillar 1.8.1 2022-08-19 [1] CRAN (R 4.1.3)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.1.0)
#> purrr 0.3.4 2020-04-17 [1] CRAN (R 4.1.0)
#> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.1.3)
#> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.1.3)
#> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.1.3)
#> R.utils 2.12.0 2022-06-28 [1] CRAN (R 4.1.3)
#> r2d3 0.2.6 2022-02-28 [1] CRAN (R 4.1.2)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.1.1)
#> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.1.3)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.1.3)
#> rmarkdown 2.16 2022-08-24 [1] CRAN (R 4.1.3)
#> rprojroot 2.0.3 2022-04-02 [1] CRAN (R 4.1.3)
#> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.1.3)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.1.1)
#> sparklyr * 1.7.8 2022-08-16 [1] CRAN (R 4.1.3)
#> stringi 1.7.8 2022-07-11 [1] CRAN (R 4.1.3)
#> stringr 1.4.1 2022-08-20 [1] CRAN (R 4.1.3)
#> styler 1.7.0 2022-03-13 [1] CRAN (R 4.1.2)
#> tibble 3.1.8 2022-07-22 [1] CRAN (R 4.1.3)
#> tidyr 1.2.1 2022-09-08 [1] CRAN (R 4.1.3)
#> tidyselect 1.1.2 2022-02-21 [1] CRAN (R 4.1.2)
#> utf8 1.2.2 2021-07-24 [1] CRAN (R 4.1.0)
#> vctrs 0.4.1 2022-04-13 [1] CRAN (R 4.1.3)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.1.2)
#> xfun 0.33 2022-09-12 [1] CRAN (R 4.1.3)
#> yaml 2.3.5 2022-02-21 [1] CRAN (R 4.1.2)
#>
#> [1] /data/disk1/anaconda3/envs/gcenv_R413_202209/lib/R/library
#>
#> ──────────────────────────────────────────────────────────────────────────────