sparklyr / sparklyr

R interface for Apache Spark

Home Page: https://spark.rstudio.com/


Consider removing `compute.tbl_spark()` code

edgararuiz opened this issue

This code does not actually execute; it appears that dispatch falls through immediately to the next method:

sparklyr/R/dplyr_spark.R, lines 229 to 239 at d10f06f:

```r
remote_name <- sdf_remote_name(out)

# We then need a separate SQL query to cache the resulting view, as there is
# no way (yet) to both create and cache a view using a single Spark SQL query.
tbl_cache(
  sc = spark_connection(x),
  name = as.character(remote_name),
  force = TRUE
)
out
```
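For context, here is a minimal sketch of how this code path is reached (assuming a local Spark installation; the table and variable names are hypothetical):

```r
library(sparklyr)
library(dplyr)

sc <- spark_connect(master = "local")
cars <- copy_to(sc, mtcars, "mtcars_tbl")

# compute() materializes the lazy query as a named Spark view; method
# dispatch should land in compute.tbl_spark(), which then attempts to
# cache the view via the tbl_cache() call quoted above.
fast_cars <- cars %>%
  filter(mpg > 20) %>%
  compute(name = "fast_cars")
```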

tbl_cache() and friends seem to be used only by compute.tbl_spark(), so they could be removed as well:

```r
tbl_cache_sdf <- function(sc, name, force) {
  tbl <- tbl(sc, name)
  sdf <- spark_dataframe(tbl)

  invoke(sdf, "cache")

  if (force) {
    invoke(sdf, "count")
  }
}

tbl_cache_sql <- function(sc, name, force) {
  sql <- paste("CACHE TABLE", tbl_quote_name(sc, name))
  invoke(hive_context(sc), "sql", sql)

  if (force) {
    sql <- paste("SELECT count(*) FROM ", tbl_quote_name(sc, name))
    sdf <- invoke(hive_context(sc), "sql", sql)
    sdf_collect(sdf)
  }
}

#' Cache a Spark Table
#'
#' Force a Spark table with name \code{name} to be loaded into memory.
#' Operations on cached tables should normally (although not always)
#' be more performant than the same operation performed on an uncached
#' table.
#'
#' @param sc A \code{spark_connection}.
#' @param name The table name.
#' @param force Force the data to be loaded into memory? This is accomplished
#'   by calling the \code{count} API on the associated Spark DataFrame.
#'
#' @export
tbl_cache <- function(sc, name, force = TRUE) {
  countColumns <- function(sc, name) {
    sql <- sprintf("SELECT * FROM %s LIMIT 0", tbl_quote_name(sc, name))
    sdf <- invoke(hive_context(sc), "sql", sql)
    length(invoke(sdf, "columns"))
  }

  # We prefer to cache tables using SQL syntax, since this tracks the table
  # name in logs and the UI with a friendly name, say "In-memory table df".
  # tbl_cache_sdf() is used for tables with a high number of columns;
  # however, it displays a non-friendly name that we try to avoid.
  if (spark_version(sc) < "2.0.0" && countColumns(sc, name) >= 1000) {
    tbl_cache_sdf(sc, name, force)
  } else {
    tbl_cache_sql(sc, name, force)
  }

  invisible(NULL)
}
```
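Since tbl_cache() is exported, here is a hedged sketch of calling it directly, with a hypothetical table name; the SQL branch above is roughly what you would get by issuing CACHE TABLE yourself through sparklyr's DBI interface:

```r
# Eagerly cache a registered table by name (force = TRUE runs a count()).
tbl_cache(sc, "mtcars_tbl", force = TRUE)

# Rough DBI equivalent of the tbl_cache_sql() branch shown above:
DBI::dbGetQuery(sc, "CACHE TABLE `mtcars_tbl`")
DBI::dbGetQuery(sc, "SELECT COUNT(*) FROM `mtcars_tbl`")  # force analogue

# Release the cached blocks when done.
tbl_uncache(sc, "mtcars_tbl")
```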

The code is covered by tests, so there is no need to remove it for now:

https://app.codecov.io/gh/sparklyr/sparklyr/blob/main/R%2Fdplyr_spark.R#L224