Consider removing `compute.tbl_spark()` code
edgararuiz opened this issue · comments
Edgar Ruiz commented
This code does not really execute; it seems that it immediately goes to the next method.
Lines 229 to 239 in d10f06f
remote_name <- sdf_remote_name(out)

# We then need a separate SQL query to cache the resulting view, as there is
# no way (yet) to both create and cache a view using a single Spark SQL query.
tbl_cache(
  sc = spark_connection(x),
  name = as.character(remote_name),
  force = TRUE
)

out
Edgar Ruiz commented
tbl_cache() and friends seem to be used only by compute.tbl_spark(), so it could be removed.
Lines 16 to 69 in d10f06f
tbl_cache_sdf <- function(sc, name, force) {
  tbl <- tbl(sc, name)
  sdf <- spark_dataframe(tbl)

  invoke(sdf, "cache")
  if (force) {
    invoke(sdf, "count")
  }
}

tbl_cache_sql <- function(sc, name, force) {
  sql <- paste("CACHE TABLE", tbl_quote_name(sc, name))
  invoke(hive_context(sc), "sql", sql)
  if (force) {
    sql <- paste("SELECT count(*) FROM ", tbl_quote_name(sc, name))
    sdf <- invoke(hive_context(sc), "sql", sql)
    sdf_collect(sdf)
  }
}
#' Cache a Spark Table
#'
#' Force a Spark table with name \code{name} to be loaded into memory.
#' Operations on cached tables should normally (although not always)
#' be more performant than the same operation performed on an uncached
#' table.
#'
#' @param sc A \code{spark_connection}.
#' @param name The table name.
#' @param force Force the data to be loaded into memory? This is accomplished
#'   by calling the \code{count} API on the associated Spark DataFrame.
#'
#' @export
tbl_cache <- function(sc, name, force = TRUE) {
  countColumns <- function(sc, name) {
    sql <- sprintf("SELECT * FROM %s LIMIT 0", tbl_quote_name(sc, name))
    sdf <- invoke(hive_context(sc), "sql", sql)
    length(invoke(sdf, "columns"))
  }

  # We prefer to cache tables using SQL syntax since this would track the
  # table names in logs and ui with a friendly name, say "In-memory table df".
  # Using tbl_cache_sdf is supported for high-number of columns; however, it
  # displays a non-friendly name that we try to avoid.
  if (spark_version(sc) < "2.0.0" && countColumns(sc, name) >= 1000) {
    tbl_cache_sdf(sc, name, force)
  } else {
    tbl_cache_sql(sc, name, force)
  }

  invisible(NULL)
Edgar Ruiz commented
The code is being tested, so there is no need to remove it for now.
https://app.codecov.io/gh/sparklyr/sparklyr/blob/main/R%2Fdplyr_spark.R#L224