sparklyr / sparklyr

R interface for Apache Spark

Home Page: https://spark.rstudio.com/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Incompatible types error for joins not occurring.

JoDudding opened this issue · comments

When joining Spark tables whose join variable has a different class in each table (character in one, numeric in the other), the join does not fail the way it does in dplyr with local data frames. Instead, the join succeeds and the key column is silently converted to character. I'm using sparklyr 1.7.8.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(sparklyr)
#> 
#> Attaching package: 'sparklyr'
#> The following object is masked from 'package:stats':
#> 
#>     filter

# Spark connection ----

# Current OS user name; not referenced again in the code shown in this reprex.
user <- Sys.getenv("USER")

# Build the configuration object, then override individual Spark settings.
conf <- sparklyr::spark_config()

# Executor resources
conf$spark.executor.instances <- 2
conf$spark.executor.cores <- 2
conf$spark.executor.memory <- "8G"
conf$spark.driver.maxResultSize <- "6G"

# Dynamic allocation: idle timeouts and executor count limits
conf$spark.dynamicAllocation.executorIdleTimeout <- "60s"
conf$spark.dynamicAllocation.cachedExecutorIdleTimeout <- "20m"
conf$spark.dynamicAllocation.initialExecutors <- 1
conf$spark.dynamicAllocation.minExecutors <- 0
conf$spark.dynamicAllocation.maxExecutors <- 8

# Kryo serializer buffer limit
conf$spark.kryoserializer.buffer.max <- "1G"

# Open the connection. Plain `<-` is used because this is a top-level
# assignment; `<<-` is only needed to assign into an enclosing scope from
# inside a function.
sc <- sparklyr::spark_connect(
  master = "yarn-client",
  version = "2.4.3",
  config = conf
)


# Example tables ----
# Two single-column tables that share the key name `customer_no` but store it
# with different types: integer in `cust_numeric`, character in
# `cust_character`.

cust_numeric <- tibble(
  customer_no = c(
    10000001L, 20000002L, 30000003L,
    40000004L, 50000005L, 60000006L,
    70000007L, 80000008L, 90000009L
  )
)

cust_character <- tibble(
  customer_no = c("10000001", "20000002", "70000007", "80000008", "90000009")
)

# Join in both directions on the local tibbles - fails as expected, because
# dplyr rejects a join between an integer key and a character key.

cust_numeric |>
  full_join(cust_character, by = "customer_no") |>
  print()
#> Error in `full_join()`:
#> ! Can't join on `x$customer_no` x `y$customer_no` because of
#>   incompatible types.
#> ℹ `x$customer_no` is of type <integer>>.
#> ℹ `y$customer_no` is of type <character>>.

#> Backtrace:
#>      ▆
#>   1. ├─base::print(full_join(cust_numeric, cust_character, by = "customer_no"))
#>   2. ├─dplyr::full_join(cust_numeric, cust_character, by = "customer_no")
#>   3. └─dplyr:::full_join.data.frame(cust_numeric, cust_character, by = "customer_no")
#>   4.   └─dplyr:::join_mutate(...)
#>   5.     └─dplyr:::join_rows(...)
#>   6.       └─base::tryCatch(...)
#>   7.         └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#>   8.           └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#>   9.             └─value[[3L]](cond)
#>  10.               └─rlang::abort(bullets, call = error_call)

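# NOTE: the message below names `left_join()` and <double>, whereas the call
# above is `full_join()` on an <integer> key - presumably pasted from an
# earlier run. The incompatible-types error itself is the same:
# Error in `left_join()`:
# ! Can't join on `x$customer_no` x `y$customer_no` because of incompatible types.
# ℹ `x$customer_no` is of type <double>>.
# ℹ `y$customer_no` is of type <character>>.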

# Same local join with the tables swapped (character key on the left); dplyr
# raises the same incompatible-types error.
cust_character |>
  full_join(cust_numeric, by = "customer_no") |>
  print()
#> Error in `full_join()`:
#> ! Can't join on `x$customer_no` x `y$customer_no` because of
#>   incompatible types.
#> ℹ `x$customer_no` is of type <character>>.
#> ℹ `y$customer_no` is of type <integer>>.

#> Backtrace:
#>      ▆
#>   1. ├─base::print(full_join(cust_character, cust_numeric, by = "customer_no"))
#>   2. ├─dplyr::full_join(cust_character, cust_numeric, by = "customer_no")
#>   3. └─dplyr:::full_join.data.frame(cust_character, cust_numeric, by = "customer_no")
#>   4.   └─dplyr:::join_mutate(...)
#>   5.     └─dplyr:::join_rows(...)
#>   6.       └─base::tryCatch(...)
#>   7.         └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#>   8.           └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#>   9.             └─value[[3L]](cond)
#>  10.               └─rlang::abort(bullets, call = error_call)

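# NOTE: the message below names `left_join()` and <double>, whereas the call
# above is `full_join()` with an <integer> key on the right - presumably
# pasted from an earlier run. The incompatible-types error itself is the same:
# Error in `left_join()`:
# ! Can't join on `x$customer_no` x `y$customer_no` because of incompatible types.
# ℹ `x$customer_no` is of type <character>>.
# ℹ `y$customer_no` is of type <double>>.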

# Copy both example tables to Spark ----
# `overwrite = TRUE` is passed so that a table of the same name left over from
# a previous run is replaced.

cust_numeric_spark <- copy_to(
  dest = sc,
  df = cust_numeric,
  overwrite = TRUE
)

cust_character_spark <- copy_to(
  dest = sc,
  df = cust_character,
  overwrite = TRUE
)

# Join in both directions, this time on the Spark tables - no errors.
# The join succeeds and the printed `customer_no` column has type <chr>.

cust_numeric_spark |>
  full_join(cust_character_spark, by = "customer_no") |>
  print()
#> # Source: spark<?> [?? x 1]
#>   customer_no
#>   <chr>      
#> 1 60000006   
#> 2 10000001   
#> 3 40000004   
#> 4 20000002   
#> 5 70000007   
#> 6 80000008   
#> 7 30000003   
#> 8 90000009   
#> 9 50000005

# Same Spark join with the tables swapped (character key on the left); it
# also succeeds and prints a <chr> `customer_no` column.
cust_character_spark |>
  full_join(cust_numeric_spark, by = "customer_no") |>
  print()
#> # Source: spark<?> [?? x 1]
#>   customer_no
#>   <chr>      
#> 1 60000006   
#> 2 10000001   
#> 3 40000004   
#> 4 20000002   
#> 5 70000007   
#> 6 80000008   
#> 7 30000003   
#> 8 90000009   
#> 9 50000005

Created on 2023-11-27 with reprex v2.0.2

Session info
# Print the R session settings and package versions used for this reprex.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.1.3 (2022-03-10)
#>  os       RHEL
#>  system   x86_64, linux-gnu
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Pacific/Auckland
#>  date     2023-11-27
#>  pandoc   2.2.3.2 @ /data/disk1/anaconda3/bin/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version date (UTC) lib source
#>  askpass       1.1     2019-01-13 [1] CRAN (R 4.1.0)
#>  assertthat    0.2.1   2019-03-21 [1] CRAN (R 4.1.0)
#>  base64enc     0.1-3   2015-07-28 [1] CRAN (R 4.1.0)
#>  blob          1.2.3   2022-04-10 [1] CRAN (R 4.1.3)
#>  cli           3.4.1   2022-09-23 [1] CRAN (R 4.1.3)
#>  config        0.3.1   2020-12-17 [1] CRAN (R 4.1.0)
#>  DBI           1.1.3   2022-06-18 [1] CRAN (R 4.1.3)
#>  dbplyr        2.2.1   2022-06-27 [1] CRAN (R 4.1.3)
#>  digest        0.6.29  2021-12-01 [1] CRAN (R 4.1.1)
#>  dplyr       * 1.0.10  2022-09-01 [1] CRAN (R 4.1.3)
#>  ellipsis      0.3.2   2021-04-29 [1] CRAN (R 4.1.0)
#>  evaluate      0.16    2022-08-09 [1] CRAN (R 4.1.3)
#>  fansi         1.0.3   2022-03-24 [1] CRAN (R 4.1.2)
#>  fastmap       1.1.0   2021-01-25 [1] CRAN (R 4.1.0)
#>  forge         0.2.0   2019-02-26 [1] CRAN (R 4.1.0)
#>  fs            1.5.2   2021-12-08 [1] CRAN (R 4.1.3)
#>  generics      0.1.3   2022-07-05 [1] CRAN (R 4.1.3)
#>  glue          1.6.2   2022-02-24 [1] CRAN (R 4.1.2)
#>  highr         0.9     2021-04-16 [1] CRAN (R 4.1.0)
#>  htmltools     0.5.3   2022-07-18 [1] CRAN (R 4.1.3)
#>  htmlwidgets   1.5.4   2021-09-08 [1] CRAN (R 4.1.1)
#>  httr          1.4.4   2022-08-17 [1] CRAN (R 4.1.3)
#>  jsonlite      1.8.0   2022-02-22 [1] CRAN (R 4.1.2)
#>  knitr         1.40    2022-08-24 [1] CRAN (R 4.1.3)
#>  lifecycle     1.0.2   2022-09-09 [1] CRAN (R 4.1.3)
#>  magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.1.3)
#>  openssl       2.0.3   2022-09-14 [1] CRAN (R 4.1.3)
#>  pillar        1.8.1   2022-08-19 [1] CRAN (R 4.1.3)
#>  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.1.0)
#>  purrr         0.3.4   2020-04-17 [1] CRAN (R 4.1.0)
#>  R.cache       0.16.0  2022-07-21 [1] CRAN (R 4.1.3)
#>  R.methodsS3   1.8.2   2022-06-13 [1] CRAN (R 4.1.3)
#>  R.oo          1.25.0  2022-06-12 [1] CRAN (R 4.1.3)
#>  R.utils       2.12.0  2022-06-28 [1] CRAN (R 4.1.3)
#>  r2d3          0.2.6   2022-02-28 [1] CRAN (R 4.1.2)
#>  R6            2.5.1   2021-08-19 [1] CRAN (R 4.1.1)
#>  reprex        2.0.2   2022-08-17 [1] CRAN (R 4.1.3)
#>  rlang         1.0.6   2022-09-24 [1] CRAN (R 4.1.3)
#>  rmarkdown     2.16    2022-08-24 [1] CRAN (R 4.1.3)
#>  rprojroot     2.0.3   2022-04-02 [1] CRAN (R 4.1.3)
#>  rstudioapi    0.14    2022-08-22 [1] CRAN (R 4.1.3)
#>  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.1.1)
#>  sparklyr    * 1.7.8   2022-08-16 [1] CRAN (R 4.1.3)
#>  stringi       1.7.8   2022-07-11 [1] CRAN (R 4.1.3)
#>  stringr       1.4.1   2022-08-20 [1] CRAN (R 4.1.3)
#>  styler        1.7.0   2022-03-13 [1] CRAN (R 4.1.2)
#>  tibble        3.1.8   2022-07-22 [1] CRAN (R 4.1.3)
#>  tidyr         1.2.1   2022-09-08 [1] CRAN (R 4.1.3)
#>  tidyselect    1.1.2   2022-02-21 [1] CRAN (R 4.1.2)
#>  utf8          1.2.2   2021-07-24 [1] CRAN (R 4.1.0)
#>  vctrs         0.4.1   2022-04-13 [1] CRAN (R 4.1.3)
#>  withr         2.5.0   2022-03-03 [1] CRAN (R 4.1.2)
#>  xfun          0.33    2022-09-12 [1] CRAN (R 4.1.3)
#>  yaml          2.3.5   2022-02-21 [1] CRAN (R 4.1.2)
#> 
#>  [1] /data/disk1/anaconda3/envs/gcenv_R413_202209/lib/R/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────